All downloads are free. Search and download functionality uses the official Maven repository.

com.vmware.xenon.services.common.ConsistentHashingNodeSelectorService Maven / Gradle / Ivy

There is a newer version: 1.6.18
Show newest version
/*
 * Copyright (c) 2014-2015 VMware, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.  You may obtain a copy of
 * the License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, without warranties or
 * conditions of any kind, EITHER EXPRESS OR IMPLIED.  See the License for the
 * specific language governing permissions and limitations under the License.
 */

package com.vmware.xenon.services.common;

import java.net.URI;
import java.util.Collection;
import java.util.TreeMap;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.stream.Collectors;

import com.vmware.xenon.common.FNVHash;
import com.vmware.xenon.common.NodeSelectorService;
import com.vmware.xenon.common.NodeSelectorService.SelectAndForwardRequest.ForwardingOption;
import com.vmware.xenon.common.NodeSelectorState;
import com.vmware.xenon.common.Operation;
import com.vmware.xenon.common.Operation.CompletionHandler;
import com.vmware.xenon.common.Service;
import com.vmware.xenon.common.ServiceClient;
import com.vmware.xenon.common.ServiceConfigUpdateRequest;
import com.vmware.xenon.common.ServiceConfiguration;
import com.vmware.xenon.common.ServiceDocument;
import com.vmware.xenon.common.ServiceErrorResponse;
import com.vmware.xenon.common.ServiceHost;
import com.vmware.xenon.common.StatelessService;
import com.vmware.xenon.common.UriUtils;
import com.vmware.xenon.common.Utils;
import com.vmware.xenon.services.common.NodeGroupService.NodeGroupChange;
import com.vmware.xenon.services.common.NodeGroupService.NodeGroupState;
import com.vmware.xenon.services.common.NodeGroupService.UpdateQuorumRequest;

/**
 * Uses consistent hashing to assign a client-specified key to one
 * of the nodes in the node group. This service is associated with a specific node group.
 */
public class ConsistentHashingNodeSelectorService extends StatelessService implements
        NodeSelectorService {

    // Maximum number of requests allowed to wait in pendingRequestQueue; can be changed
    // through a configuration PATCH
    private long operationQueueLimit = Service.OPERATION_QUEUE_DEFAULT_LIMIT;
    // Number of requests currently parked in pendingRequestQueue (tracked separately from the queue)
    private AtomicLong pendingOperationCount = new AtomicLong();
    // Requests received while the selector was not available; drained during maintenance
    private ConcurrentLinkedQueue<SelectAndForwardRequest> pendingRequestQueue = new ConcurrentLinkedQueue<>();

    // Cached node group state. Refreshed during maintenance
    private NodeGroupState cachedGroupState;

    // Cached initial state. This service has "soft" state: It's configured on start and then its state is immutable.
    // If the service host restarts, all state is lost, by design.
    // Note: This is not a recommended pattern! Regular services must not use instanced fields
    private NodeSelectorState cachedState;

    // Helper that performs the actual replication of updates to the selected nodes
    private NodeSelectorReplicationService replicationUtility;

    // Set when the cached node group state changes; consumed (and cleared) during maintenance
    private volatile boolean isSynchronizationRequired;
    private volatile boolean isSetFactoriesAvailabilityRequired;
    // Reset to false on every cached node group state update, set again by the convergence check
    private boolean isNodeGroupConverged;
    // Counts consecutive "quorum not met" warnings so logging can be silenced after a threshold
    private int synchQuorumWarningCount;

    /**
     * Sorted map, keyed by distance, that keeps at most {@code maxN} entries: the ones with
     * the smallest keys offered so far. The first entry is therefore the closest neighbour.
     */
    private static final class ClosestNNeighbours extends TreeMap<Long, NodeState> {
        private static final long serialVersionUID = 0L;

        private final int maxN;

        /**
         * @param maxN maximum number of entries the map will retain
         */
        public ClosestNNeighbours(int maxN) {
            super(Long::compare);
            this.maxN = maxN;
        }

        /**
         * Adds the entry if there is room, or if its key is not greater than the current
         * largest key. In the latter case the largest entry is evicted to keep the size bound.
         *
         * @return the value previously associated with {@code key}, or {@code null} if there
         *         was none or the entry was rejected
         */
        @Override
        public NodeState put(Long key, NodeState value) {
            if (size() < this.maxN) {
                return super.put(key, value);
            }

            // map is full: only attempt to write if new key can displace one of the top N entries
            if (comparator().compare(key, lastKey()) > 0) {
                return null;
            }

            NodeState old = super.put(key, value);
            if (old == null) {
                // a new key was added (not a replacement), so drop the current largest key
                remove(lastKey());
            }
            return old;
        }
    }

    /**
     * Passes {@link NodeSelectorState} to the superclass constructor and turns on the
     * CORE, PERIODIC_MAINTENANCE and INSTRUMENTATION service options.
     */
    public ConsistentHashingNodeSelectorService() {
        super(NodeSelectorState.class);
        super.toggleOption(ServiceOption.CORE, true);
        // NOTE(review): presumably what causes handleMaintenance to be invoked periodically - confirm
        super.toggleOption(ServiceOption.PERIODIC_MAINTENANCE, true);
        // NOTE(review): this service calls adjustStat; presumably that relies on INSTRUMENTATION - confirm
        super.toggleOption(ServiceOption.INSTRUMENTATION, true);
    }

    /**
     * Builds the cached selector state from the start body (or defaults to the default node
     * group when no body is supplied), configures per-tag connection limits on the host
     * client and then starts the helper services, which complete the start operation.
     */
    @Override
    public void handleStart(Operation start) {
        NodeSelectorState initialState;
        if (start.hasBody()) {
            initialState = start.getBody(NodeSelectorState.class);
        } else {
            initialState = new NodeSelectorState();
            initialState.nodeGroupLink = ServiceUriPaths.DEFAULT_NODE_GROUP;
        }

        // connection limits for the replication, synchronization and forwarding tags
        ServiceClient client = getHost().getClient();
        client.setConnectionLimitPerTag(
                ServiceClient.CONNECTION_TAG_REPLICATION,
                NodeSelectorService.REPLICATION_TAG_CONNECTION_LIMIT);
        client.setConnectionLimitPerTag(
                ServiceClient.CONNECTION_TAG_SYNCHRONIZATION,
                NodeSelectorService.SYNCHRONIZATION_TAG_CONNECTION_LIMIT);
        client.setConnectionLimitPerTag(
                ServiceClient.CONNECTION_TAG_FORWARDING,
                FORWARDING_TAG_CONNECTION_LIMIT);

        // stamp the document fields before publishing the state to the instance field
        initialState.documentSelfLink = getSelfLink();
        initialState.documentKind = Utils.buildKind(NodeSelectorState.class);
        initialState.documentOwner = getHost().getId();

        this.cachedState = initialState;
        this.replicationUtility = new NodeSelectorReplicationService(this);
        startHelperServices(start);
    }

    /**
     * Kicks off the four asynchronous steps that must succeed before {@code op} completes:
     * the node group subscription, the initial node group GET, and the start of the
     * synchronization and forwarding helper services. Any failure fails {@code op}.
     */
    private void startHelperServices(Operation op) {
        allocateUtilityService();

        AtomicInteger pendingSteps = new AtomicInteger(4);
        CompletionHandler onStepDone = (o, e) -> {
            if (e != null) {
                op.fail(e);
                return;
            }
            if (pendingSteps.decrementAndGet() == 0) {
                op.complete();
            }
        };

        // step 1: subscribe to node group notifications
        Operation subscribePost = Operation.createPost(
                UriUtils.buildSubscriptionUri(getHost(), this.cachedState.nodeGroupLink))
                .setCompletion(onStepDone)
                .setReferer(getUri());
        getHost().startSubscriptionService(subscribePost, handleNodeGroupNotification());

        // step 2: we subscribe to avoid GETs on node group state, per operation, but we need to
        // have the initial node group state, before service is available.
        Operation initialGet = Operation.createGet(this, this.cachedState.nodeGroupLink)
                .setCompletion((o, e) -> {
                    if (e != null) {
                        if (!getHost().isStopping()) {
                            logSevere(e);
                        }
                    } else {
                        updateCachedNodeGroupState(o.getBody(NodeGroupState.class), null);
                    }
                    onStepDone.handle(o, e);
                });
        sendRequest(initialGet);

        // steps 3 and 4: helper services hosted under this service's URI
        URI synchUri = UriUtils.extendUri(getUri(),
                ServiceUriPaths.SERVICE_URI_SUFFIX_SYNCHRONIZATION);
        URI forwardingUri = UriUtils.extendUri(getUri(),
                ServiceUriPaths.SERVICE_URI_SUFFIX_FORWARDING);
        Operation synchPost = Operation.createPost(synchUri).setCompletion(onStepDone);
        Operation forwardingPost = Operation.createPost(forwardingUri).setCompletion(onStepDone);

        getHost().startService(synchPost, new NodeSelectorSynchronizationService(this));
        getHost().startService(forwardingPost, new NodeSelectorForwardingService(this));
    }

    /**
     * Builds the callback invoked for node group subscription notifications.
     *
     * A PATCH whose body is an {@link UpdateQuorumRequest} updates the cached quorum values.
     * Any other PATCH, and any POST, is treated as a {@link NodeGroupState} update. Other
     * actions, and updates without any nodes, are ignored.
     *
     * @return the notification handler to register with the subscription
     */
    private Consumer<Operation> handleNodeGroupNotification() {
        return (notifyOp) -> {
            // acknowledge the notification first, processing happens on the cached copy
            notifyOp.complete();

            if (notifyOp.getAction() == Action.PATCH) {
                UpdateQuorumRequest bd = notifyOp.getBody(UpdateQuorumRequest.class);
                if (UpdateQuorumRequest.KIND.equals(bd.kind)) {
                    updateCachedNodeGroupState(null, bd);
                    return;
                }
                // not a quorum update: fall through and interpret the body as node group state
            } else if (notifyOp.getAction() != Action.POST) {
                return;
            }

            NodeGroupState ngs = notifyOp.getBody(NodeGroupState.class);
            if (ngs.nodes == null || ngs.nodes.isEmpty()) {
                return;
            }
            updateCachedNodeGroupState(ngs, null);
        };
    }

    /**
     * Lets POST and GET requests through without an authorization check; every other
     * action goes through the default authorization of the superclass.
     */
    @Override
    public void authorizeRequest(Operation op) {
        Action action = op.getAction();
        if (action == Action.POST || action == Action.GET) {
            // Authorize selection requests, they have no side effects other than CPU usage (and
            // back pressure can be used to throttle them). Forwarding requests will have
            // authorization applied on them as part of their target service processing
            op.complete();
            return;
        }

        super.authorizeRequest(op);
    }

    /**
     * Entry point for all requests. GET returns the cached selector state, DELETE and PATCH
     * use the default processing of the superclass, and POST carries a
     * {@code SelectAndForwardRequest} that is validated and handed to selectAndForward.
     * Any other action is rejected.
     */
    @Override
    public void handleRequest(Operation op) {
        Action action = op.getAction();

        if (action == Action.GET) {
            // this.cachedState might be stale if NodeSelector status is UNAVAILABLE.
            // Status gets actively updated if we are going from AVAILABLE TO UNAVAILABLE status.
            // But transition to AVAILABLE is lazy so we update it on GET request.
            if (!NodeSelectorState.isAvailable(this.cachedState)) {
                synchronized (this.cachedState) {
                    NodeSelectorState.updateStatus(getHost(), this.cachedGroupState, this.cachedState);
                }
            }
            op.setBody(this.cachedState).complete();
            return;
        }

        // DELETE, and PATCH (update to node selector state), take the default path
        if (action == Action.DELETE || action == Action.PATCH) {
            super.handleRequest(op);
            return;
        }

        if (action != Action.POST) {
            Operation.failActionNotSupported(op);
            return;
        }

        if (!op.hasBody()) {
            op.fail(new IllegalArgumentException("Body is required"));
            return;
        }

        SelectAndForwardRequest request = op.getBody(SelectAndForwardRequest.class);
        boolean hasKeyOrPath = request.key != null || request.targetPath != null;
        if (!hasKeyOrPath) {
            op.fail(new IllegalArgumentException("key or targetPath is required"));
            return;
        }

        selectAndForward(op, request);
    }

    /**
     * Handles PATCH requests. The only supported body is an
     * {@code UpdateReplicationQuorumRequest}, identified through its document kind; a
     * missing body, a missing kind or any other kind fails the operation.
     */
    @Override
    public void handlePatch(Operation patch) {
        if (!patch.hasBody()) {
            patch.fail(new IllegalArgumentException("Body is required"));
            return;
        }

        // parse as the base document type first, just to discover the kind of the request
        ServiceDocument doc = patch.getBody(ServiceDocument.class);
        if (doc.documentKind == null) {
            patch.fail(new IllegalArgumentException("Kind is required"));
            return;
        }

        if (!UpdateReplicationQuorumRequest.KIND.equals(doc.documentKind)) {
            patch.fail(new IllegalArgumentException("Unexpected request kind " + doc.documentKind));
            return;
        }

        updateReplicationQuorum(patch, patch.getBody(UpdateReplicationQuorumRequest.class));
    }

    /**
     * Handles configuration requests. GET is answered locally with a configuration that
     * reports this service's own operation queue limit. PATCH with a body records a new
     * queue limit (when supplied) and then, like every other non-GET request, is passed to
     * the superclass.
     */
    @Override
    public void handleConfigurationRequest(Operation op) {
        Action action = op.getAction();

        if (action == Action.GET) {
            ServiceConfiguration cfg = Utils.buildServiceConfig(new ServiceConfiguration(), this);
            cfg.epoch = 0;
            cfg.operationQueueLimit = (int) this.operationQueueLimit;
            op.setBodyNoCloning(cfg).complete();
            return;
        }

        if (action == Action.PATCH && op.hasBody()) {
            ServiceConfigUpdateRequest update = op.getBody(ServiceConfigUpdateRequest.class);
            if (update.operationQueueLimit != null) {
                this.operationQueueLimit = update.operationQueueLimit;
            }
        }

        super.handleConfigurationRequest(op);
    }

    /**
     *  Infrastructure use only. Called by the service host to determine the node group this
     *  selector is associated with.
     *
     *  If selectors become indexed services, this will need to be removed and the
     *  service host should do an asynchronous query or GET to retrieve the selector state. Since this
     *  is not an API a service author can call (they have no access to this instance), the change will
     *  be transparent to runtime users.
     *
     *  @return the node group link stored in the cached selector state
     */
    @Override
    public String getNodeGroupPath() {
        return this.cachedState.nodeGroupLink;
    }

    /**
     *  Infrastructure use only.
     *
     *  Runs selection (and, when requested, forwarding) for {@code body} against the
     *  currently cached node group state.
     *
     *  @param op the operation that will be completed or failed with the outcome
     *  @param body the selection / forwarding request
     */
    @Override
    public void selectAndForward(Operation op, SelectAndForwardRequest body) {
        selectAndForward(body, op, this.cachedGroupState);
    }

    /**
     * Infrastructure use only
     *
     * This method uses cached {@link NodeGroupState}; therefore, caller needs to make sure the
     * nodegroup state is stable before calling this method.
     *
     * @param path the key to hash when selecting the owner
     * @return the selection result computed by selectNodes for {@code path}
     */
    @Override
    public SelectOwnerResponse findOwnerNode(String path) {
        return selectNodes(path, this.cachedGroupState);
    }

    /**
     * Processes a selection request: queues it if the selector is not available, otherwise
     * either broadcasts / replicates it, answers with the selection result, or forwards the
     * operation to the selected owner node.
     *
     * Owner selection itself happens in selectNodes, which uses the squared difference
     * between the hashed key and the hashed id of each member node.
     *
     * @param forwardRequest the request; its key (or, when absent, its targetPath) is hashed
     * @param op the operation to complete, fail or forward
     * @param localState the node group state to select from
     */
    private void selectAndForward(SelectAndForwardRequest forwardRequest, Operation op, NodeGroupState localState) {

        String keyValue = forwardRequest.key != null ? forwardRequest.key : forwardRequest.targetPath;
        forwardRequest.associatedOp = op;

        // either the request was failed, or it was queued for a later maintenance pass
        if (queueRequestIfNodeGroupIsUnavailable(localState, forwardRequest)) {
            return;
        }

        // broadcast with no replication factor configured: target all nodes (or the candidate
        // subset), without running owner selection or the quorum check below
        if (this.cachedState.replicationFactor == null && forwardRequest.options != null
                && forwardRequest.options.contains(ForwardingOption.BROADCAST)) {
            SelectOwnerResponse response = new SelectOwnerResponse();
            response.key = keyValue;
            response.selectedNodes = forwardRequest.candidateNodes == null ? localState.nodes.values()
                    : localState.nodes.values().stream().filter(
                            ns -> forwardRequest.candidateNodes.contains(ns.id))
                    .collect(Collectors.toList());
            if (forwardRequest.options.contains(ForwardingOption.REPLICATE)) {
                replicateRequest(op, forwardRequest, response);
                return;
            }
            broadcast(op, forwardRequest, response);
            return;
        }

        // select nodes and update response
        SelectOwnerResponse response = selectNodes(keyValue, localState);

        // refuse to proceed when fewer nodes are available than the membership quorum
        int quorum = this.cachedState.membershipQuorum;
        int availableNodeCount = response.availableNodeCount;
        if (availableNodeCount < quorum) {
            op.fail(new IllegalStateException("Available nodes: " + availableNodeCount + ", quorum:" + quorum));
            return;
        }


        // no target path means this is a pure selection request: return the selection result
        if (forwardRequest.targetPath == null) {
            op.setBodyNoCloning(response).complete();
            return;
        }

        // broadcast with a replication factor: only the selected nodes are targeted
        if (forwardRequest.options != null && forwardRequest.options.contains(ForwardingOption.BROADCAST)) {
            if (forwardRequest.options.contains(ForwardingOption.REPLICATE)) {
                if (op.getAction() == Action.DELETE) {
                    // replicated DELETEs are sent to every node, not just the selected ones
                    response.selectedNodes = localState.nodes.values();
                }
                replicateRequest(op, forwardRequest, response);
            } else {
                broadcast(op, forwardRequest, response);
            }
            return;
        }

        // If targetPath != null, we need to forward the operation.
        URI remoteService = UriUtils.buildServiceUri(response.ownerNodeGroupReference.getScheme(),
                response.ownerNodeGroupReference.getHost(),
                response.ownerNodeGroupReference.getPort(),
                forwardRequest.targetPath, forwardRequest.targetQuery, null);

        // forward a clone, and copy headers, status code and body of its response back onto
        // the original operation before completing or failing it
        Operation fwdOp = op.clone()
                .setCompletion(
                        (o, e) -> {
                            op.transferResponseHeadersFrom(o).setStatusCode(o.getStatusCode())
                                    .setBodyNoCloning(o.getBodyRaw());
                            if (e != null) {
                                op.fail(e);
                                return;
                            }
                            op.complete();
                        });
        getHost().getClient().send(fwdOp.setUri(remoteService));
    }

    /**
     * Selects the owner and closest nodes for {@code key}, considering every node in
     * {@code localState} (no candidate restriction).
     */
    private SelectOwnerResponse selectNodes(String key, NodeGroupState localState) {
        return selectNodes(key, localState, null);
    }

    /**
     * Selects the owner node, and the set of closest nodes, for a key.
     *
     * Each available node is ranked by the squared difference between its hashed id and the
     * hashed key; the node with the smallest distance becomes the owner. The number of
     * selected nodes is the configured replication factor, or 1 when none is configured.
     * A single-node group short-circuits to the local node.
     *
     * @param key the key to hash
     * @param localState the node group state to select from
     * @param candidateNodes ids of the nodes to consider, or {@code null} to consider all
     *        nodes in {@code localState}
     * @return the selection result, including the number of available nodes seen
     */
    private SelectOwnerResponse selectNodes(String key, NodeGroupState localState,
            Collection<String> candidateNodes) {
        NodeState self = localState.nodes.get(getHost().getId());
        SelectOwnerResponse response = new SelectOwnerResponse();
        response.key = key;

        if (localState.nodes.size() == 1) {
            // NOTE(review): assumes the only node in the group is the local host - confirm
            response.ownerNodeId = self.id;
            response.isLocalHostOwner = true;
            response.ownerNodeGroupReference = self.groupReference;
            response.selectedNodes = localState.nodes.values();
            response.membershipUpdateTimeMicros = localState.membershipUpdateTimeMicros;
            response.availableNodeCount = 1;
            return response;
        }

        int neighbourCount = 1;
        if (this.cachedState.replicationFactor != null) {
            neighbourCount = this.cachedState.replicationFactor.intValue();
        }

        ClosestNNeighbours closestNodes = new ClosestNNeighbours(neighbourCount);

        long keyHash = FNVHash.compute(response.key);
        Collection<String> nodeIds = candidateNodes != null ? candidateNodes : localState.nodes.keySet();
        for (String nodeId : nodeIds) {
            NodeState m = localState.nodes.get(nodeId);
            if (NodeState.isUnAvailable(m)) {
                continue;
            }

            response.availableNodeCount++;

            long distance = m.getNodeIdHash() - keyHash;
            distance *= distance;
            // We assume first key (smallest) will be one with closest distance. The hashing
            // function can return negative numbers however, so a distance of zero (closest) will
            // not be the first key. Take the absolute value to cover that case and create a logical
            // ring
            distance = Math.abs(distance);
            closestNodes.put(distance, m);
        }

        // NOTE(review): firstEntry() is null when no node was available, which would throw
        // here before callers get to inspect availableNodeCount - confirm this cannot happen
        NodeState closest = closestNodes.firstEntry().getValue();
        response.ownerNodeId = closest.id;
        response.isLocalHostOwner = response.ownerNodeId.equals(getHost().getId());
        response.ownerNodeGroupReference = closest.groupReference;
        response.selectedNodes = closestNodes.values();
        response.membershipUpdateTimeMicros = localState.membershipUpdateTimeMicros;

        return response;
    }

    /**
     * Sends a copy of {@code op} to every selected node and completes {@code op} with a
     * {@code NodeGroupBroadcastResponse} that aggregates the individual results.
     *
     * Unavailable nodes, and the local node when EXCLUDE_ENTRY_NODE is requested, are skipped
     * but still counted towards completion. Individual failures are recorded in the response;
     * they do not fail {@code op}.
     *
     * @param op the original operation, completed once every node has been accounted for
     * @param req the request carrying the target path, query and forwarding options
     * @param selectRsp the selection result whose selected nodes are the broadcast targets
     */
    private void broadcast(Operation op, SelectAndForwardRequest req,
            SelectOwnerResponse selectRsp) {

        Collection<NodeState> members = selectRsp.selectedNodes;
        AtomicInteger remaining = new AtomicInteger(members.size());
        NodeGroupBroadcastResponse rsp = new NodeGroupBroadcastResponse();

        if (remaining.get() == 0) {
            op.setBody(rsp).complete();
            return;
        }

        rsp.membershipQuorum = this.cachedState.membershipQuorum;

        AtomicInteger availableNodeCount = new AtomicInteger();
        CompletionHandler c = (o, e) -> {
            // add failure or success response to the appropriate, concurrent map
            if (e != null) {
                ServiceErrorResponse errorRsp = Utils.toServiceErrorResponse(e);
                errorRsp.statusCode = o.getStatusCode();
                rsp.failures.put(o.getUri(), errorRsp);
            } else if (o != null && o.hasBody()) {
                rsp.jsonResponses.put(o.getUri(), Utils.toJson(o.getBodyRaw()));
            }

            // only the last completion (skipped nodes included) finishes the original operation
            if (remaining.decrementAndGet() != 0) {
                return;
            }
            rsp.nodeCount = this.cachedGroupState.nodes.size();
            rsp.availableNodeCount = availableNodeCount.get();
            op.setBodyNoCloning(rsp).complete();
        };

        for (NodeState m : members) {
            boolean isExcludedEntryNode = req.options.contains(ForwardingOption.EXCLUDE_ENTRY_NODE)
                    && m.id.equals(getHost().getId());

            if (NodeState.isUnAvailable(m) || isExcludedEntryNode) {
                // nothing is sent, but the node must still be counted as done
                c.handle(null, null);
                continue;
            }

            URI remoteService = UriUtils.buildUri(m.groupReference.getScheme(),
                    m.groupReference.getHost(),
                    m.groupReference.getPort(),
                    req.targetPath, req.targetQuery);

            // create a operation for the equivalent service instance on the
            // remote node
            Operation remoteOp = Operation.createPost(remoteService)
                    .transferRequestHeadersFrom(op)
                    .addPragmaDirective(Operation.PRAGMA_DIRECTIVE_NO_FORWARDING)
                    .setAction(op.getAction())
                    .setCompletion(c)
                    .transferRefererFrom(op)
                    .setExpiration(op.getExpirationMicrosUtc())
                    .setBody(op.getBodyRaw());

            // record the receiver before sending, so the response is populated by the time
            // the completion for this node can run
            rsp.receivers.add(remoteService);
            rsp.selectedNodes.put(m.id, m.groupReference);
            availableNodeCount.incrementAndGet();
            getHost().sendRequest(remoteOp);
        }
    }

    /**
     * Replicates {@code op} to the selected nodes through the replication helper, using the
     * replication quorum from the cached selector state.
     *
     * Fails {@code op} and does nothing else when no node group state has been cached yet.
     *
     * @param op the operation to replicate
     * @param body the request that triggered the replication
     * @param response the selection result naming the nodes to replicate to
     */
    private void replicateRequest(Operation op, SelectAndForwardRequest body,
            SelectOwnerResponse response) {
        NodeGroupState groupState = this.cachedGroupState;
        if (groupState == null) {
            // stop here: replicating without node group state cannot succeed, and the
            // operation must not be processed further once it has been failed
            op.fail(new IllegalStateException("Node group state is not available"));
            return;
        }
        this.replicationUtility.replicateUpdate(groupState, op, body, response,
                this.cachedState.replicationQuorum);
    }

    /**
     * Decides whether a request can be processed right away.
     *
     * Returns true when the caller must stop processing the request: either it was queued
     * and will be processed once the node group is available, or its operation was failed
     * here (host stopping, operation expired, or queue limit reached). Returns false when
     * the selector is available and the caller should proceed.
     *
     * @param localState node group state used to refresh the selector status
     * @param body the request; its associatedOp is the operation that gets failed or queued
     */
    private boolean queueRequestIfNodeGroupIsUnavailable(NodeGroupState localState,
            SelectAndForwardRequest body) {

        Operation op = body.associatedOp;
        if (getHost().isStopping()) {
            op.fail(new CancellationException("host is stopping"));
            return true;
        }

        if (op.getExpirationMicrosUtc() < Utils.getSystemNowMicrosUtc()) {
            // operation has expired
            op.fail(new CancellationException(String.format(
                    "Operation already expired, will not queue. Exp:%d, now:%d",
                    op.getExpirationMicrosUtc(), Utils.getSystemNowMicrosUtc())));
            return true;
        }

        // the transition to available is lazy, so refresh the status before giving up
        if (!NodeSelectorState.isAvailable(this.cachedState)) {
            synchronized (this.cachedState) {
                NodeSelectorState.updateStatus(getHost(), localState, this.cachedState);
            }
        }

        if (NodeSelectorState.isAvailable(this.cachedState)) {
            return false;
        }

        // approximate check for queue limit (not atomic)
        if (this.operationQueueLimit <= this.pendingOperationCount.get()) {
            adjustStat(STAT_NAME_LIMIT_EXCEEDED_FAILED_REQUEST_COUNT, 1);
            Operation.failLimitExceeded(op,
                    ServiceErrorResponse.ERROR_CODE_SERVICE_QUEUE_LIMIT_EXCEEDED,
                    "pendingRequestQueue on " + getSelfLink());
            return true;
        }

        adjustStat(STAT_NAME_QUEUED_REQUEST_COUNT, 1);

        // queue a clone of the request; the operation reference is detached while cloning
        // and re-attached to the clone only, so the caller's instance ends up with a null
        // associatedOp
        body.associatedOp = null;
        body = Utils.clone(body);
        body.associatedOp = op;

        this.pendingOperationCount.incrementAndGet();
        this.pendingRequestQueue.add(body);
        return true;
    }

    /**
     * Invoked by parent during its maintenance interval.
     *
     * Drains queued requests first, then checks whether convergence / synchronization work
     * is needed. When a convergence check was started, that check completes the operation;
     * otherwise it is completed here.
     *
     * @param maintOp the maintenance operation to complete
     */
    @Override
    public void handleMaintenance(Operation maintOp) {
        performPendingRequestMaintenance();

        boolean completionDeferred = checkAndScheduleSynchronization(
                this.cachedGroupState.membershipUpdateTimeMicros, maintOp);
        if (!completionDeferred) {
            maintOp.complete();
        }
    }

    /**
     * Drains the queue of requests that arrived while the selector was unavailable.
     *
     * Stops at the first sign that the selector is still unavailable (after refreshing its
     * status), leaving the remaining requests queued for the next maintenance pass. Requests
     * are failed instead of processed when the host is stopping.
     */
    private void performPendingRequestMaintenance() {
        while (!this.pendingRequestQueue.isEmpty()) {
            if (!NodeSelectorState.isAvailable(this.cachedState)) {
                // update status in case group state changed; guarded by the same lock the
                // request paths use when they update the cached state's status
                synchronized (this.cachedState) {
                    NodeSelectorState.updateStatus(getHost(), this.cachedGroupState, this.cachedState);
                }
                // Optimization: if the node group is not ready do not evaluate each
                // request. We check for availability in the selectAndForward method as well.
                return;
            }

            SelectAndForwardRequest req = this.pendingRequestQueue.poll();
            if (req == null) {
                // another thread emptied the queue between the isEmpty check and the poll
                break;
            }

            this.pendingOperationCount.decrementAndGet();

            if (getHost().isStopping()) {
                req.associatedOp.fail(new CancellationException("Host is stopping"));
                continue;
            }
            selectAndForward(req, req.associatedOp, this.cachedGroupState);
        }
    }

    /**
     * Decides what maintenance has to do about node group changes.
     *
     * While membership is not settled, or the group has not been seen converged, a
     * convergence check is started and true is returned: that check takes over completion
     * of {@code maintOp}. Otherwise any pending synchronization and factory availability
     * work is scheduled and false is returned, as it is when the host is stopping.
     *
     * @param membershipUpdateMicros membership update time the convergence check is tied to
     * @param maintOp the maintenance operation
     * @return true when completion of {@code maintOp} was handed to the convergence check
     */
    private boolean checkAndScheduleSynchronization(long membershipUpdateMicros,
            Operation maintOp) {
        if (getHost().isStopping()) {
            return false;
        }

        boolean isSettled = NodeGroupUtils.isMembershipSettled(getHost(),
                getHost().getMaintenanceIntervalMicros(), this.cachedGroupState);
        if (!isSettled || !this.isNodeGroupConverged) {
            checkConvergence(membershipUpdateMicros, maintOp);
            return true;
        }

        if (this.isSynchronizationRequired) {
            this.isSynchronizationRequired = false;

            // to keep existing behavior, only update stats when peer synch is enabled
            if (getHost().isPeerSynchronizationEnabled()) {
                int nodeCount = this.cachedGroupState.nodes.size();
                logInfo("Scheduling synchronization (%d nodes)", nodeCount);
                adjustStat(STAT_NAME_SYNCHRONIZATION_COUNT, 1);
            }
            getHost().scheduleNodeGroupChangeMaintenance(getSelfLink());
        }

        if (this.isSetFactoriesAvailabilityRequired) {
            this.isSetFactoriesAvailabilityRequired = false;
            logInfo("Setting factories availability on owner node");
            getHost().setFactoriesAvailabilityIfOwner(true);
        }

        return false;
    }

    /**
     * Fetches the current node group state, refreshes the cache with it and asks
     * NodeGroupUtils to check convergence. Every path through the callbacks completes
     * {@code maintOp}; failures are logged and left for a later maintenance pass.
     *
     * The group is only marked converged when the check succeeds, membership quorum is
     * met, and the cached membership update time still equals
     * {@code membershipUpdateMicros}.
     *
     * @param membershipUpdateMicros membership update time observed when the check was requested
     * @param maintOp the maintenance operation to complete when the check is done
     */
    private void checkConvergence(long membershipUpdateMicros, Operation maintOp) {

        // completion for the node group GET issued at the bottom of this method
        CompletionHandler c = (o, e) -> {
            if (e != null) {
                if (!getHost().isStopping()) {
                    logSevere(e);
                }
                maintOp.complete();
                return;
            }

            // number of "quorum not met" warnings logged before they are silenced
            final int quorumWarningsBeforeQuiet = 10;
            if (!o.hasBody()) {
                logWarning("Missing node group state");
                maintOp.complete();
                return;
            }
            NodeGroupState ngs = o.getBody(NodeGroupState.class);
            updateCachedNodeGroupState(ngs, null);
            // carrier operation for the convergence check: only referer, expiration and
            // completion are set on it
            Operation op = Operation.createPost(null)
                    .setReferer(getUri())
                    .setExpiration(
                            Utils.fromNowMicrosUtc(getHost().getOperationTimeoutMicros()));
            NodeGroupUtils
                    .checkConvergence(
                            getHost(),
                            ngs,
                            op.setCompletion((o1, e1) -> {
                                if (e1 != null) {
                                    logWarning("Failed convergence check, will retry: %s",
                                            e1.getMessage());
                                    maintOp.complete();
                                    return;
                                }

                                if (!NodeGroupUtils.hasMembershipQuorum(getHost(),
                                        this.cachedGroupState)) {
                                    // warn a limited number of times, then announce the
                                    // silencing once and stay quiet
                                    if (this.synchQuorumWarningCount < quorumWarningsBeforeQuiet) {
                                        logWarning("Synchronization quorum not met");
                                    } else if (this.synchQuorumWarningCount == quorumWarningsBeforeQuiet) {
                                        logWarning("Synchronization quorum not met, warning will be silenced");
                                    }
                                    this.synchQuorumWarningCount++;
                                    maintOp.complete();
                                    return;
                                }

                                // if node group changed since we kicked of this check, we need to wait for
                                // newer convergence completions
                                synchronized (this.cachedState) {
                                    this.isNodeGroupConverged = membershipUpdateMicros == this.cachedGroupState.membershipUpdateTimeMicros;
                                    if (this.isNodeGroupConverged) {
                                        this.synchQuorumWarningCount = 0;
                                    }
                                }
                                maintOp.complete();
                            }));
        };

        sendRequest(Operation.createGet(this, this.cachedState.nodeGroupLink).setCompletion(c));
    }

    /**
     * Refreshes the cached selector state from either a node group state or a quorum
     * update request.
     *
     * The cached selector status is always reset to UNAVAILABLE first. When
     * {@code quorumUpdate} is non-null only the quorum values are applied and the method
     * returns without storing {@code ngs}. Otherwise {@code ngs} replaces the cached group
     * state unless the cached copy carries a newer document update time.
     *
     * NOTE(review): the code dereferences whichever argument it takes to be present, so
     * callers are presumably expected to pass exactly one non-null argument - confirm.
     *
     * @param ngs node group state to cache, or null for a quorum-only update
     * @param quorumUpdate quorum values to apply, or null for a group state update
     */
    private void updateCachedNodeGroupState(NodeGroupState ngs, UpdateQuorumRequest quorumUpdate) {
        if (ngs == null) {
            if (quorumUpdate.membershipQuorum != null) {
                logInfo("Quorum update: %d", quorumUpdate.membershipQuorum);
            }
        } else {
            NodeGroupState previous = this.cachedGroupState;
            boolean availableNow = NodeSelectorState.isAvailable(getHost(), ngs);
            boolean availableBefore = previous != null
                    && NodeSelectorState.isAvailable(getHost(), previous);
            // Log only when there is a previous state to compare against and either the
            // availability or the node count differs from it.
            if (previous != null
                    && (availableNow != availableBefore
                            || previous.nodes.size() != ngs.nodes.size())) {
                logInfo("Node count: %d, available: %s, update time: %d (%d)",
                        ngs.nodes.size(),
                        availableNow,
                        ngs.membershipUpdateTimeMicros, ngs.localMembershipUpdateTimeMicros);
            }
        }

        long updateTime = Utils.getNowMicrosUtc();
        synchronized (this.cachedState) {
            this.cachedState.status = NodeSelectorState.Status.UNAVAILABLE;

            if (quorumUpdate != null) {
                // Quorum-only update: record the new values and leave the group state as is.
                this.cachedState.documentUpdateTimeMicros = updateTime;
                if (quorumUpdate.membershipQuorum != null) {
                    this.cachedState.membershipQuorum = quorumUpdate.membershipQuorum;
                }
                if (this.cachedGroupState == null) {
                    return;
                }
                // Mirror the quorum values onto the local node's entry in the cached group.
                if (quorumUpdate.membershipQuorum != null) {
                    this.cachedGroupState.nodes.get(
                            getHost().getId()).membershipQuorum = quorumUpdate.membershipQuorum;
                }
                if (quorumUpdate.locationQuorum != null) {
                    this.cachedGroupState.nodes.get(
                            getHost().getId()).locationQuorum = quorumUpdate.locationQuorum;
                }
                return;
            }

            if (this.cachedGroupState == null) {
                this.cachedGroupState = ngs;
            }

            if (this.cachedGroupState.documentUpdateTimeMicros > ngs.documentUpdateTimeMicros) {
                // The cached group state is newer than the supplied one; keep it.
                return;
            }

            NodeSelectorState.updateStatus(getHost(), ngs, this.cachedState);
            this.cachedState.documentUpdateTimeMicros = updateTime;
            this.cachedState.membershipUpdateTimeMicros = ngs.membershipUpdateTimeMicros;
            this.cachedGroupState = ngs;

            // Every time the cached state is updated, request a convergence check.
            this.isNodeGroupConverged = false;
            this.isSynchronizationRequired = true;

            // Synchronization is skipped when the only change is PEER_UNAVAILABLE, because it
            // will be triggered once that node is reported as PEER_EXPIRED. PEER_UNAVAILABLE
            // indicates that the node has just become unavailable and will expire later if it
            // does not come back online.
            if (getHost().isPeerSynchronizationEnabled()
                    && ngs.lastChanges != null
                    && ngs.lastChanges.size() == 1) {
                boolean peerExpired = ngs.lastChanges.contains(NodeGroupChange.PEER_EXPIRED);
                if (peerExpired || ngs.lastChanges.contains(NodeGroupChange.PEER_UNAVAILABLE)) {
                    this.isSynchronizationRequired = false;
                    if (peerExpired) {
                        // Synchronization is not required, but factory availability must be
                        // set on the hosts that own the factories: ownership has changed and
                        // clients might depend on up-to-date factory availability.
                        this.isSetFactoriesAvailabilityRequired = true;
                    }
                }
            }
        }
    }

    @Override
    public void updateReplicationQuorum(Operation op, UpdateReplicationQuorumRequest r) {
        if (r.replicationQuorum == null) {
            op.fail(new IllegalArgumentException("replication quorum is required"));
            return;
        }
        int replicationQuorum = r.replicationQuorum;
        int replicationFactor = this.cachedState.replicationFactor != null ?
                this.cachedState.replicationFactor.intValue() : this.cachedGroupState.nodes.size();
        if (replicationQuorum > replicationFactor) {
            String errorMsg = String.format(
                    "replicationQuorum %d > replicationFactor %d", replicationQuorum, replicationFactor);
            op.fail(new IllegalArgumentException(errorMsg));
            return;
        }
        // broadcast
        logInfo("replicationQuorum update from %d to %d", this.cachedState.replicationQuorum, replicationQuorum);
        this.cachedState.replicationQuorum = replicationQuorum;
        if (!r.isGroupUpdate) {
            op.complete();
            return;
        }
        r.isGroupUpdate = false;
        AtomicInteger pending = new AtomicInteger(this.cachedGroupState.nodes.size());
        CompletionHandler c = (o, e) -> {
            if (e != null) {
                // fail the original update request if one peer update failed
                op.fail(e);
                return;
            }
            int p = pending.decrementAndGet();
            if (p != 0) {
                return;
            }
            op.complete();
        };
        for (NodeState node : this.cachedGroupState.nodes.values()) {
            if (!NodeState.isAvailable(node, getHost().getId(), true)) {
                c.handle(null, null);
                continue;
            }
            URI peerNodeSelectorService = UriUtils.buildUri(node.groupReference.getScheme(),
                    node.groupReference.getHost(),
                    node.groupReference.getPort(),
                    getSelfLink(), null);
            Operation p = Operation
                    .createPatch(peerNodeSelectorService)
                    .setBody(r)
                    .setCompletion(c);
            sendRequest(p);
        }
    }

    /**
     * Resolves the utility service for the given URI path.
     *
     * Paths ending with the replication suffix are served by this selector's replication
     * utility; every other path is delegated to the superclass.
     *
     * @param uriPath path of the requested utility service
     * @return the replication utility, or whatever the superclass resolves for the path
     */
    @Override
    public Service getUtilityService(String uriPath) {
        if (!uriPath.endsWith(ServiceHost.SERVICE_URI_SUFFIX_REPLICATION)) {
            return super.getUtilityService(uriPath);
        }
        // replication requests are handled by the dedicated replication utility
        return this.replicationUtility;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy