oracle.kv.impl.api.rgstate.RepGroupState Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of oracle-nosql-server Show documentation
NoSQL Database Server - supplies build and runtime support for the server (store) side of the Oracle NoSQL Database.
There is a newer version: 18.3.10
/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.impl.api.rgstate;

import static oracle.kv.impl.util.ObjectUtil.checkNull;

import java.util.Collection;
import java.util.Collections;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Logger;

import oracle.kv.Consistency;
import oracle.kv.Consistency.Time;
import oracle.kv.Consistency.Version;
import oracle.kv.impl.api.Request;
import oracle.kv.impl.metadata.Metadata;
import oracle.kv.impl.metadata.Metadata.MetadataType;
import oracle.kv.impl.topo.RepGroup;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.RepNode;
import oracle.kv.impl.topo.RepNodeId;
import oracle.kv.impl.topo.ResourceId;
import oracle.kv.impl.topo.StorageNode;
import oracle.kv.impl.topo.Topology;

import com.sleepycat.je.rep.ReplicatedEnvironment.State;

/**
 * RepGroupState represents the state associated with the replication group.
 * The only (non-structural) state associated with the rep group is the
 * identity of the node currently serving as the master and the time/seq#
 * associated with that event.
 * <p>
 * There are two types of updates that can be made to the rep group state:
 * <ol>
 * <li>Structural changes arising from a change in Topology.</li>
 * <li>HA state changes in response to the change in the replication state
 * associated with nodes in the group.</li>
 * </ol>
 * Two overloaded update methods are defined by this class to deal with these
 * two types of group level changes.
 *
 * TODO:
 * 1) Configurable partition-aware dispatching as an option to exploit cache
 * locality on replicas
 */
public class RepGroupState {

    /** The id of the replication group whose state is tracked here. */
    private final RepGroupId repGroupId;

    /** Passed through to every RepNodeState created by this object. */
    private final ResourceId trackerId;

    /** Passed through to every RepNodeState created by this object. */
    private final boolean async;

    private final Logger logger;

    /**
     * The master of the group, or null if the master is unknown. This iv,
     * along with the lastChange iv below, is directly derived from the
     * information in a StateChangeEvent that either originated at this node,
     * due to a Listener invocation, or due to a StateChangeEvent at a distant
     * node that was communicated to this node as part of a Response.
     */
    private volatile RepNodeId groupMasterId;

    /**
     * Represents the time of the last change in the MASTER/REPLICA rep state
     * associated with the group. It's used to determine whether an update is
     * newer than the current known state. It is only read and written while
     * holding the write lock.
     */
    private long lastChange;

    /**
     * The states for RNs that are members of the group. The collection starts
     * out empty and is filled in via calls to the update methods.
     */
    private final Map<RepNodeId, RepNodeState> rns;

    /**
     * Map of metadata sequence numbers. The sequence number is the last known
     * value. No entry will be present if there is no information for a given
     * metadata type.
     */
    private final Map<MetadataType, Integer> seqNums =
        Collections.synchronizedMap(
            new EnumMap<MetadataType, Integer>(MetadataType.class));

    /**
     * Coordinates access to the RepGroupState.
     */
    private final ReadWriteLock lock = new ReentrantReadWriteLock();

    /**
     * Used to select a random RN from within a rep group. The counter is
     * incremented without synchronization; a lost increment only affects
     * which RN is picked and is therefore harmless.
     */
    static volatile int randomRN;

    /**
     * Creates the state for a rep group. The group starts out with no known
     * RNs and no known master.
     *
     * @param repGroupId the id of the group
     * @param trackerId handed to each RepNodeState that is created
     * @param async handed to each RepNodeState that is created
     * @param logger the logger; must not be null
     */
    public RepGroupState(RepGroupId repGroupId,
                         ResourceId trackerId,
                         boolean async,
                         Logger logger) {
        this.repGroupId = repGroupId;
        this.trackerId = trackerId;
        this.async = async;
        this.logger = checkNull("logger", logger);
        rns = new ConcurrentHashMap<RepNodeId, RepNodeState>(3);
    }

    /**
     * Returns the id of the rep group described by this object.
     */
    public RepGroupId getResourceId() {
        return repGroupId;
    }

    /**
     * Returns the RN states associated with the RG. The returned collection
     * may be empty if the group has not yet been initialized with Topology
     * information.
     *
     * Note: this method is intended for testing. It provides unsynchronized
     * internal representation state.
     */
    public Collection<RepNodeState> getRepNodeStates() {
        return rns.values();
    }

    /**
     * Returns the node state associated with the rnId, creating and
     * registering a new state object if none is known yet.
     * <p>
     * Because creating a missing entry needs the write lock, this method must
     * not be called by a thread that holds only the read lock: the underlying
     * ReentrantReadWriteLock cannot upgrade a read lock to a write lock.
     * Calling it while holding the write lock is fine.
     */
    public RepNodeState get(RepNodeId rnId) {

        assert rnId.getGroupId() == repGroupId.getGroupId();

        /* Fast path: the state already exists. */
        lock.readLock().lock();
        try {
            final RepNodeState repNodeState = rns.get(rnId);
            if (repNodeState != null) {
                return repNodeState;
            }
        } finally {
            lock.readLock().unlock();
        }

        /*
         * Slow path: re-check under the write lock, since another thread may
         * have created the entry after the read lock was released.
         */
        lock.writeLock().lock();
        try {
            RepNodeState repNodeState = rns.get(rnId);
            if (repNodeState == null) {
                repNodeState =
                    new RepNodeState(rnId, trackerId, async, logger);
                rns.put(rnId, repNodeState);
            }
            return repNodeState;
        } finally {
            lock.writeLock().unlock();
        }
    }

    /**
     * Returns the node state for the master associated with the RepGroup.
     *
     * @return the RepNodeState for the master if it knows of one. Or null if
     * it does not know of one.
     */
    public RepNodeState getMaster() {
        /* Read the volatile once so the null check and lookup agree. */
        final RepNodeId stableGroupMasterId = groupMasterId;
        return (stableGroupMasterId == null) ? null : get(stableGroupMasterId);
    }

    /**
     * Returns the state already registered for the current master, without
     * creating one. Unlike {@link #getMaster()} this never needs the write
     * lock, so it is safe to call while holding the read lock.
     *
     * @return the registered master state, or null if the master is unknown
     * or has no entry in the group
     */
    private RepNodeState getKnownMaster() {
        final RepNodeId stableGroupMasterId = groupMasterId;
        return (stableGroupMasterId == null) ?
            null : rns.get(stableGroupMasterId);
    }

    /**
     * Updates the group state with information from the topology. If the
     * topology contains information about rns that it does not currently
     * contain, it updates its information. Updating the information means
     * that it will start using the new RNs during load balancing for requests.
     * RNs that no longer appear in the shard are dropped.
     *
     * @param rg the shard as described by the topology
     * @param topology the topology used to look up each RN's storage node
     */
    public void update(RepGroup rg, Topology topology) {
        lock.writeLock().lock();
        try {
            final Collection<RepNode> shardRNs = rg.getRepNodes();
            for (final RepNode rn : shardRNs) {
                final RepNodeId rnId = rn.getResourceId();
                RepNodeState rnState = rns.get(rnId);
                if (rnState == null) {
                    rnState = new RepNodeState(rnId, trackerId, async, logger);
                    rns.put(rnId, rnState);
                }
                final StorageNode sn = topology.get(rn.getStorageNodeId());
                rnState.setZoneId(sn.getDatacenterId());
            }

            /*
             * Remove any RNs that no longer appear in the shard. Every shard
             * RN was added above, so a larger map means stale entries exist.
             */
            if (shardRNs.size() < rns.size()) {
                final Iterator<RepNodeId> iter = rns.keySet().iterator();
                while (iter.hasNext()) {
                    final RepNodeId rnId = iter.next();
                    if (rg.get(rnId) == null) {
                        iter.remove();
                    }
                }
            }
        } finally {
            lock.writeLock().unlock();
        }
    }

    /**
     * Updates the HA state of the node, and possibly other nodes in the group.
     * <p>
     * If the event indicates there is a new master, the state of the nodes in
     * the group is updated to indicate that it's the sole new master. If some
     * node was previously in the master state, it's changed to being in the
     * Replica state.
     * <p>
     * Note that the update relies on time associated with change events, to
     * ensure that older updates that were delayed do not overwrite newer
     * updates. The use of time is essential for tracking the authoritative
     * master during a network partition. The non-authoritative master will
     * always have an older time associated with its state change event and
     * will therefore be superseded by the authoritative master which will have
     * a later time associated with its state change event, since its election
     * was initiated after the intervening network partition.
     *
     * @param rnId the node whose state is to be updated
     * @param masterId the master if its known or null
     * @param state the new HA state associated with rnId
     * @param time the time used to sequence the change
     */
    public void update(RepNodeId rnId,
                       RepNodeId masterId,
                       State state,
                       long time) {

        lock.writeLock().lock();
        try {
            if (time <= lastChange) {

                /*
                 * An out of sequence change, or while we are relying on a time
                 * sequence a clock skew.
                 */
                return;
            }

            if (!state.isActive()) {
                /* A localized state change to UNKNOWN, DETACHED, etc. */
                assert masterId == null;
                final RepNodeState rnState = get(rnId);
                rnState.updateRepState(state);
                return;
            }

            /*
             * Only update the time as a result of information from active
             * nodes which have information about the master state. Transitions
             * to a non active state do not update time, we wait for nodes
             * transitioning to active states to provide us with the correct
             * master when it becomes available; the information is slightly
             * inaccurate during any interregnum and request forwarding/retry
             * will deal with it.
             */
            lastChange = time;

            assert State.MASTER.equals(state) ? rnId.equals(masterId) : true;
            assert State.REPLICA.equals(state) ?
                   ((masterId !=  null) && !rnId.equals(masterId)) : true;

            /* A node reporting itself as MASTER is the master. */
            final RepNodeId newMasterId =
                State.MASTER.equals(state) ? rnId : masterId;
            groupMasterId = newMasterId;

            for (RepNodeState rnState : rns.values()) {
                if (rnState.getRepNodeId().equals(rnId)) {
                    rnState.updateRepState(state);
                    continue;
                } else if (rnState.getRepNodeId().equals(newMasterId)) {
                    /* The new master. */
                    rnState.updateRepState(State.MASTER);
                    continue;
                }

                /*
                 * Some other node, if it was a MASTER, change its state to a
                 * REPLICA, since we know it's no longer a master.
                 */
                if (rnState.getRepState().isMaster()) {
                    rnState.updateRepState(State.REPLICA);
                }
            }
        } finally {
            lock.writeLock().unlock();
        }
    }

    /**
     * Returns the load balanced RepNode for a request.
     *
     * @param request the request being dispatched
     * @param excludeRNs the set of RNs that must be excluded
     *
     * @return the load balanced active (master or replica) RN or null
     */
    public RepNodeState getLoadBalancedRN(Request request,
                                          Set<RepNodeId> excludeRNs) {
        lock.readLock().lock();
        try {
            return getLeastBusyRN(request, excludeRNs);
        } finally {
            lock.readLock().unlock();
        }
    }

    /**
     * Returns the RN that is currently least busy as determined by the number
     * of active connections.
     * <p>
     * This rather simple strategy turns out to yield superior throughput
     * performance when compared to the strategy embodied in
     * {@link #getBestRespTimeRN(Set)}. This strategy provides a throughput
     * improvement of 25% on workloadA of the YCSB benchmark, effectively
     * saturating the disk with random IOs. One possible explanation is that
     * response times necessarily represent past history, while the number of
     * currently outstanding requests is a more current measure of the expected
     * response time from the node. This permits the dispatcher to react faster
     * to instantaneous changes on an RN or the network path. It's worth noting
     * that the number of open active connections is an indirect indicator of
     * response times. An RN that serves requests more promptly is more likely
     * to have fewer active connections and will therefore be chosen more often
     * as the target of a request dispatch.
     * <p>
     * This strategy may result in even activity distribution but unequal loads
     * across the RNs, if each operation consumes a different amount of
     * resources instead of a constant amount. For example, a single key
     * lookup versus a range lookup. In this case a more sophisticated strategy
     * that "weights" each active connection based upon its type could result
     * in a more even distribution. Thus using a single key lookup as the basic
     * unit operation, an active connection performing a "range" query would be
     * weighted by a factor of say three. More sophisticated techniques could
     * be used to weight the connections dynamically based upon the response
     * time associated with a specific operation type. For now, we favor
     * simplicity and treat all operation types as consuming the same resources.
     * <p>
     * The caller is expected to hold the read lock.
     *
     * @param request the request that is to be dispatched to the RN
     *
     * @param excludeRNs the set of RNS to be excluded from consideration.
     *
     * @return the least busy active (master or replica) RN or null if there is
     * no clear choice, usually due to insufficient or out of date state
     * information
     */
    private RepNodeState getLeastBusyRN(Request request,
                                        Set<RepNodeId> excludeRNs) {

        long minRespTime = Integer.MAX_VALUE;
        RepNodeState minRN = null;
        int minActiveRequestCount = Integer.MAX_VALUE;

        for (RepNodeState rn : rns.values()) {
            if (((excludeRNs != null) &&
                  excludeRNs.contains(rn.getRepNodeId())) ||
                  rn.reqHandlerNeedsRepair() ||
                 !rn.getRepState().isActive()) {
                continue;
            }

            if (!request.isPermittedZone(rn.getZoneId())) {
                continue;
            }

            final Consistency consistency = request.getConsistency();

            if (!inConsistencyRange(rn, consistency)) {

                /*
                 * Filter out nodes that are laggards and are unlikely to
                 * satisfy consistency requirements
                 */
                continue;
            }

            final int avRespTimeMs = rn.getAvReadRespTimeMs();
            final int activeRequestCount = rn.getActiveRequestCount();

            if ((activeRequestCount > minActiveRequestCount) ||
                ((activeRequestCount == minActiveRequestCount) &&
                 (avRespTimeMs > minRespTime))) {
                /* Use resp time as a tie breaker */
                continue;
            }

            /* A new least busy RN */
            minRN = rn;
            minRespTime = avRespTimeMs;
            minActiveRequestCount = activeRequestCount;
        }
        return minRN;
    }

    /**
     * Returns true if the rn looks like it's able to satisfy any consistency
     * requirements associated with the request.
     * <p>
     * This method runs while the caller holds the read lock, so it must not
     * call {@link #get(RepNodeId)} or {@link #getMaster()}, which may need the
     * write lock.
     *
     * @param rn the RN being evaluated
     *
     * @param consistency the consistency requirement that must be satisfied by
     * the RN
     */
    @SuppressWarnings("deprecation")
    private boolean inConsistencyRange(RepNodeState rn,
                                       Consistency consistency) {

        if (consistency == null) {
            /* No consistency requirements to be satisfied. */
            return true;
        }
        if (consistency == Consistency.NONE_REQUIRED) {
            /* Any accessible node can satisfy this consistency. */
            return true;
        }

        if (consistency == Consistency.NONE_REQUIRED_NO_MASTER) {
            /* Only a replica can satisfy this consistency. */
            return !(rn.getRepNodeId().equals(groupMasterId));
        }

        if (consistency == Consistency.ABSOLUTE) {
            /* Only the master can satisfy this consistency. */
            return rn.getRepNodeId().equals(groupMasterId);
        }

        if (consistency instanceof Version) {
            final Version vConsistency = (Version) consistency;
            return rn.inConsistencyRange(System.currentTimeMillis(),
                                         vConsistency);
        }

        if (consistency instanceof Time) {
            final Time tConsistency = (Time) consistency;

            /*
             * Use the non-creating lookup: getMaster() could try to take the
             * write lock while this thread holds the read lock, which would
             * block forever.
             */
            return rn.inConsistencyRange(System.currentTimeMillis(),
                                         tConsistency, getKnownMaster());
        }

        throw new IllegalStateException("Unexpected consistency: " +
                                        consistency);
    }

    /**
     * Returns the RN that has the best responsive time, but has not exceeded
     * an activity threshold. It does this by prorating the allowed number of
     * active requests based upon relative responsive times. The lower the av
     * resp times at the node, the higher the proportion of activity directed
     * towards it. The average response time and therefore the target activity
     * percent is being continuously re-adjusted as a result of each response
     * from a node.
     * <p>
     * It's important to ensure that at least some minimal set of requests is
     * always directed at a node in order to maintain a reasonably current
     * average response time.
     * <p>
     * NOTE: We do not use this strategy currently, preferring the one that's
     * currently embodied in {@link #getLeastBusyRN(Request, Set)} instead. The
     * reasons for this choice are embedded in the method's comments. The
     * method is retained here in case we find circumstances in future where
     * this strategy proves advantageous.
     *
     * @param excludeRNs the set of RNS to be excluded from consideration.
     *
     * @return the RN with the best response times
     */
    @SuppressWarnings("unused")
    private RepNodeState getBestRespTimeRN(Set<RepNodeId> excludeRNs) {
        /* Filled from the front; unused trailing slots stay null. */
        final RepNodeState[] activeRNs = new RepNodeState[rns.size()];
        int sumAvRespTimesMs = 0;
        int totalActivity = 0;

        int index = 0;
        for (RepNodeState rn : rns.values()) {
            if (((excludeRNs != null) &&
                 excludeRNs.contains(rn.getRepNodeId())) ||
                !rn.getRepState().isActive()) {
                continue;
            }
            activeRNs[index++] = rn;
            sumAvRespTimesMs += rn.getAvReadRespTimeMs();
            totalActivity += rn.getActiveRequestCount();
        }

        /*
         * Try twice, first with thresholds in place and again, without
         * them.
         */
        for (boolean checkThreshold : new boolean[]{true, false}) {

            long minRespTime = Integer.MAX_VALUE;
            RepNodeState minRN = null;
            int minActiveRequestCount = Integer.MAX_VALUE;

            for (RepNodeState rn : activeRNs) {
                if (rn == null) {
                    /* Reached the unused tail of the array. */
                    break;
                }

                final int avRespTimeMs = rn.getAvReadRespTimeMs();
                final int activeRequestCount = rn.getActiveRequestCount();

                /*
                 * Response time is the principal consideration. Use
                 * current activity as a tie breaker. That is, direct
                 * requests to the node that's less busy.
                 */
                if ((minRespTime <  avRespTimeMs) ||
                    ((minRespTime == avRespTimeMs) &&
                     (activeRequestCount > minActiveRequestCount))) {
                    continue;
                }

                /*
                 * Ensure that requests are distributed in proportion to
                 * their load.
                 */
                if (checkThreshold &&
                    !belowActivityThreshold(avRespTimeMs,
                                            sumAvRespTimesMs,
                                            activeRequestCount,
                                            totalActivity)) {
                    continue;
                }

                minRN = rn;
                minRespTime = avRespTimeMs;
                minActiveRequestCount = activeRequestCount;
            }

            if (minRN != null) {
                return minRN;
            }

            /* Try again without accounting for activity thresholds. */
        }
        return null;
    }

    /**
     * Returns true if the activity at that node is below the current activity
     * threshold. It does this by prorating the allowed number of active
     * requests based upon relative responsive times. The lower the av resp
     * times at the node, the higher the proportion of activity directed
     * towards it. The average response time and therefore the target activity
     * percent is being continuously re-adjusted as a result of each response
     * from a node.
     * <p>
     * It's important to ensure that at least some minimal set of requests is
     * always directed at a node in order to maintain a reasonably current av
     * response time.
     *
     * @param avRespTimeMs the average response time of the node
     * @param sumAvRespTimesMs the sum of the average response times of all
     * candidate nodes
     * @param activeRequestCount the number of active requests at the node
     * @param totalActivity the sum of the active requests at all candidate
     * nodes
     *
     * @return true if the node's share of the total activity, as a percent, is
     * below its target percent, or if there is nothing to prorate
     */
    private boolean belowActivityThreshold(int avRespTimeMs,
                                           int sumAvRespTimesMs,
                                           int activeRequestCount,
                                           int totalActivity) {

        if ((totalActivity == 0) || (sumAvRespTimesMs == 0)) {
            /* No current activity, or < 1 ms resp times. */
            return true;
        }

        /*
         * The lower the av resp times, the larger the target activity. The
         * products are computed as longs so that large counts cannot
         * overflow.
         */
        final long targetActivityPercent =
            ((long) (sumAvRespTimesMs - avRespTimeMs) * 100) /
            sumAvRespTimesMs;

        final long activityPercent =
            ((long) activeRequestCount * 100) / totalActivity;

        return activityPercent < targetActivityPercent;
    }

    /**
     * Returns a random RN from within the group. This method is used as a
     * fallback when no state information is available for any of the nodes in
     * the RG. It represents a "last ditch" attempt to pick some RN in the
     * rep group to which a request can be dispatched. If no RNs are in the
     * group, or all are excluded or in a zone the request does not permit,
     * null is returned.
     *
     * @param request the request being dispatched, or null for a NOP request
     * @param excludeRNs the set of RNs that must be excluded
     * @return a randomly chosen RN or null
     */
    public RepNodeState getRandomRN(Request request,
                                    Set<RepNodeId> excludeRNs) {

        lock.readLock().lock();
        try {
            /*
             * Check the size under the lock so that a concurrent removal
             * cannot turn the modulus below into a division by zero.
             */
            final int size = rns.size();
            if (size == 0) {
                return null;
            }

            /*
             * Mask the sign bit so the starting position stays non-negative
             * after the counter wraps around.
             */
            int rnIndex = (randomRN++ & Integer.MAX_VALUE) % size;

            /* Fallback if no eligible RN is at or after the start position */
            RepNodeState firstEligible = null;

            for (RepNodeState rn : rns.values()) {
                rnIndex--;
                if ((excludeRNs != null) &&
                    excludeRNs.contains(rn.getRepNodeId())) {
                    continue;
                }
                if ((request != null) &&
                    !request.isPermittedZone(rn.getZoneId())) {
                    continue;
                }
                if (rnIndex < 0) {
                    return rn;
                }
                if (firstEligible == null) {
                    firstEligible = rn;
                }
            }

            /* Wrap around; null only if no RN was eligible at all. */
            return firstEligible;
        } finally {
            lock.readLock().unlock();
        }
    }

    /**
     * Returns the current seq number associated with the node for the specified
     * metadata. The value Metadata.UNKNOWN_SEQ_NUM is returned if the sequence
     * number is not known.
     *
     * @param type a metadata type
     * @return the current sequence number
     */
    public int getSeqNum(MetadataType type) {
        assert type != MetadataType.TOPOLOGY;
        synchronized (seqNums) {
            final Integer seqNum = seqNums.get(type);
            return seqNum == null ? Metadata.UNKNOWN_SEQ_NUM : seqNum;
        }
    }

    /**
     * Updates the sequence number of the specified metadata, moving it forward.
     * If the new sequence number is less than or equal to the current sequence
     * number the current sequence number is unchanged.
     *
     * @param type a metadata type
     * @param newSeqNum the new sequence number
     * @return the current sequence number
     */
    public int updateSeqNum(MetadataType type, int newSeqNum) {
        assert type != MetadataType.TOPOLOGY;

        /* Hold the map's monitor so the read and the write are one step. */
        synchronized (seqNums) {
            final int oldSeqNum = getSeqNum(type);
            if (oldSeqNum < newSeqNum) {
                seqNums.put(type, newSeqNum);
                return newSeqNum;
            }
            return oldSeqNum;
        }
    }

    @Override
    public String toString() {
        return "RepGroupState[" + repGroupId + ", " + groupMasterId + "]";
    }
}