// oracle.kv.impl.api.TopologyManager Maven / Gradle / Ivy
/*-
* Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle NoSQL
* Database made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle NoSQL Database for a copy of the license and
* additional information.
*/
package oracle.kv.impl.api;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.WeakHashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import oracle.kv.StaleStoreHandleException;
import oracle.kv.impl.fault.OperationFaultException;
import oracle.kv.impl.fault.WrappedClientException;
import oracle.kv.impl.security.InvalidSignatureException;
import oracle.kv.impl.topo.Partition;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.topo.change.TopologyChange;
/**
* Coordinates access to the in-memory copy of the Topology. Saving the
* Topology in an environment is done, if needed, by the RepNode itself.
*
* It makes provisions for registering pre and post update listeners that are
* invoked whenever the topology is changed. It's worth noting that there are
* three sets of callbacks that are executed in the following sequence:
* PreUpdateListener callbacks, Localizer callbacks, PostUpdateListener
* callbacks.
*
* Note that some of the methods relating to the persistent management of
* topology are in RepNode rather than in this class where they would appear to
* belong logically. This is to ensure that this shared class which is used
* both by KV clients and RNs does not contain references to JE classes.
*/
public class TopologyManager {
/**
 * The name of the kvstore this manager serves.
 */
private final String kvsName;

/*
 * The current in-memory copy of the "official" Topology. It is null until
 * the first successful update.
 */
private volatile Topology topology;

/**
 * The local topology. The local topology can only ever differ from the
 * in-memory copy when the manager is running on the RepNode. In this case
 * the local topology may contain modifications to the "official"
 * topology due to partition migration activity. The local topology must
 * only be used to direct client operations and must NEVER be sent to
 * another node.
 */
private Topology localTopology;

/*
 * The optional hook used to derive the local topology from an official
 * one. When null, no localization is attempted.
 */
private Localizer localizer = null;

/**
 * The listeners to be invoked before proceeding with a Topology update.
 * Access must be synchronized on the manager instance.
 */
private final List<PreUpdateListener> preUpdateListeners =
    new LinkedList<PreUpdateListener>();

/**
 * The listeners to be invoked after a Topology update. Access must be
 * synchronized on the manager instance. The key is always the listener.
 * If the listener is held weakly the value is null, which allows the
 * listener to be gc'ed; otherwise the value is the listener itself, which
 * keeps a strong reference to it. (WeakHashMap key references are weak,
 * while its value references are strong.)
 */
private final Map<PostUpdateListener, PostUpdateListener>
    postUpdateListeners =
        new WeakHashMap<PostUpdateListener, PostUpdateListener>();

/**
 * The number of topology changes to be retained when managing the
 * topology.
 */
private final int maxTopoChanges;

/* Destination for informational and warning messages. */
private final Logger logger;
/**
 * Creates a manager for the named store. The manager holds no Topology
 * when it is created; the first one is installed by a call to
 * {@link #update}.
 *
 * @param kvsName the name of the store
 * @param maxTopoChanges the max number of changes to be retained
 * @param logger a logger
 */
public TopologyManager(String kvsName,
                       int maxTopoChanges,
                       Logger logger) {
    this.logger = logger;
    this.maxTopoChanges = maxTopoChanges;
    this.kvsName = kvsName;
}
/**
 * Registers a pre update listener, unless it is already registered. Pre
 * update listeners exist mainly so that a new topology can be validated
 * before the manager switches over to it.
 *
 * @param listener the new listener
 */
public synchronized void addPreUpdateListener(PreUpdateListener listener) {
    final boolean alreadyRegistered = preUpdateListeners.contains(listener);
    if (alreadyRegistered) {
        return;
    }
    preUpdateListeners.add(listener);
}
/**
 * Registers a strongly held post update listener. Components that depend
 * on the Topology should register one so that they are told whenever the
 * Topology changes.
 *
 * @param listener the new listener
 */
public void addPostUpdateListener(PostUpdateListener listener) {
    final boolean heldWeakly = false;
    addPostUpdateListener(listener, heldWeakly);
}
/**
 * Registers a post update listener, unless it is already registered.
 * Components that depend on the Topology should register one so that they
 * are told whenever the Topology changes.
 *
 * @param listener the new listener
 * @param weak if true the manager keeps only a weak reference to the
 * listener, allowing it to be GCed once the caller is done with it;
 * if false the manager keeps the listener strongly reachable
 */
public synchronized void addPostUpdateListener(PostUpdateListener listener,
                                               boolean weak) {
    if (postUpdateListeners.containsKey(listener)) {
        /* Already registered, the existing entry is left untouched. */
        return;
    }

    /*
     * The map key is only weakly referenced. A null value leaves the
     * listener collectable; storing the listener as the value pins it
     * with a hard reference.
     */
    final PostUpdateListener strongRef;
    if (weak) {
        strongRef = null;
    } else {
        strongRef = listener;
    }
    postUpdateListeners.put(listener, strongRef);
}
/**
 * Unregisters the given post update listener. Do not call this from
 * within PostUpdateListener.postUpdate().
 *
 * @param listener the listener to remove
 */
public synchronized void removePostUpdateListener(
    PostUpdateListener listener) {

    postUpdateListeners.remove(listener);
}
/**
 * Calls every registered pre update listener with the candidate topology.
 * This happens before the "official" topology is replaced. The caller
 * must hold the lock on this manager.
 *
 * @param newTopology the topology that is about to be installed
 */
private void invokePreUpdateListeners(Topology newTopology) {
    assert Thread.holdsLock(this);

    /* Notify each listener in turn. */
    final Iterator<PreUpdateListener> listeners =
        preUpdateListeners.iterator();
    while (listeners.hasNext()) {
        listeners.next().preUpdate(newTopology);
    }
}
/**
 * Invoke the registered post update listeners. These listeners are invoked
 * after either the "official" or local topology has been updated; in both
 * cases they are passed the "official" topology. The caller must hold the
 * lock on this manager.
 *
 * A listener that returns true from postUpdate() is unregistered. An
 * OperationFaultException thrown by a listener does not stop the
 * remaining listeners from being invoked; the failures are collected and
 * reported together once all listeners have run. Any other exception
 * propagates immediately.
 *
 * @throws OperationFaultException if one or more listeners failed with an
 * OperationFaultException
 */
private void invokePostUpdateListeners() {
    assert Thread.holdsLock(this);

    /* Inform the listeners. */
    final Iterator<PostUpdateListener> itr =
        postUpdateListeners.keySet().iterator();
    final StringBuilder excStrBuilder = new StringBuilder();
    while (itr.hasNext()) {
        final PostUpdateListener listener = itr.next();
        try {
            if (listener.postUpdate(topology)) {
                /* The listener asked to be dropped. */
                itr.remove();
            }
        } catch (OperationFaultException e) {
            /* Remember the failure and keep going. */
            if (excStrBuilder.length() == 0) {
                excStrBuilder.append("Some topology post updates failed: ");
            } else {
                excStrBuilder.append(", ");
            }
            excStrBuilder.
                append(listener).append(":").
                append("(").append(e.getMessage()).append(")");
        }
    }

    if (excStrBuilder.length() != 0) {
        throw new OperationFaultException(excStrBuilder.toString());
    }
}
/**
 * Installs the localizer for this manager. Its localizeTopology() method
 * is called whenever the topology is updated; passing null disables
 * localization.
 *
 * @param localizer the localizer to use, or null for none
 */
public void setLocalizer(Localizer localizer) {
    this.localizer = localizer;
}
/**
 * Returns the current in-memory copy of the "official" topology.
 *
 * @return the current topology, or null if none has been installed yet
 */
public Topology getTopology() {
    return topology;
}
/**
 * Returns the topology to use for directing client requests on this node:
 * the local topology when one exists, otherwise the "official" topology.
 * The returned topology must NEVER be sent to another node.
 *
 * @return the local topology
 */
public Topology getLocalTopology() {
    if (localTopology != null) {
        return localTopology;
    }
    return topology;
}
/**
 * Replaces the local topology directly. For use by unit tests only.
 *
 * @param localTopology the topology to install as the local topology
 */
public void setLocalTopology(Topology localTopology) {
    this.localTopology = localTopology;
}
/**
 * Updates the Topology by replacing the entire Topology with a new
 * instance. This is typically done in response to a request from the SNA,
 * or when the topology cannot be updated incrementally because the
 * necessary sequence of changes is not available in incremental form.
 *
 * The update is only done if the Topology is not current. If the Topology
 * needed to be updated, but the update failed, false is returned.
 * Otherwise true is returned.
 *
 * The steps, in order, are: validate the new topology against the current
 * one, invoke the pre update listeners, localize the topology, install
 * the (pruned) new topology and finally invoke the post update listeners.
 *
 * @param newTopology the new Topology
 *
 * @return false if the update failed
 *
 * @throws IllegalArgumentException if a topology is already installed and
 * the new topology belongs to a differently named store
 */
public synchronized boolean update(Topology newTopology) {

    final int currSeqNum;
    if (topology != null) {
        /*
         * Validate the incoming topology, not the one that is already
         * installed: it is the update that may belong to another store.
         */
        final String newStoreName = newTopology.getKVStoreName();
        if (!kvsName.equals(newStoreName)) {
            throw new IllegalArgumentException
                ("Update topology associated with KVStore: " +
                 newStoreName + " expected: " + kvsName);
        }
        checkTopologyId(topology.getId(), newTopology.getId());
        currSeqNum = topology.getSequenceNumber();
    } else {
        currSeqNum = 0;
    }

    final int newSequenceNumber = newTopology.getSequenceNumber();
    if (currSeqNum >= newSequenceNumber) {
        /* Already current, nothing to do; not a failure. */
        logger.log(Level.INFO,
                   "Topology update skipped. " +
                   "Current seq #: {0} Update seq #: {1}",
                   new Object[]{currSeqNum, newSequenceNumber});
        return true;
    }

    checkVersion(logger, newTopology);

    /*
     * Pre-updater may verify the signature of new topology copy. If the
     * verification failed, don't continue with the update.
     */
    try {
        invokePreUpdateListeners(newTopology);
    } catch (InvalidSignatureException ise) {
        logger.log(Level.INFO,
                   "Topology update to seq# {0} skipped due to " +
                   "invalid signature.",
                   newSequenceNumber);
        return false;
    }

    /*
     * If updating the local topology fails don't continue with the update.
     */
    if (!updateLocalTopology(newTopology)) {
        return false;
    }

    logger.log(Level.INFO, "Topology updated from seq#: {0} to {1}",
               new Object[]{currSeqNum, newSequenceNumber});

    topology = newTopology.pruneChanges(Integer.MAX_VALUE, maxTopoChanges);

    /*
     * Inform components that are dependent upon the Topology, so they
     * can fix their internal state.
     */
    invokePostUpdateListeners();
    return true;
}
/**
 * Ensures that any changes in partition assignment at an RN can be
 * explained by elasticity operations that are in progress. This
 * verification relies on use of an absolutely consistent local topology
 * which is only available at the master, so the check is only done on the
 * master. It's the caller's responsibility to ensure that the method is
 * only invoked on the master. The call is currently accomplished via the
 * PreUpdateListener registered by the RepNode which has access to the
 * replicated environment handle and can determine the HA state and
 * decide whether the call should be made.
 *
 * The check is skipped when there is no current topology or the current
 * topology has no partitions.
 *
 * @param rgId the replication group associated with the checks
 *
 * @param newTopo the new topology that is being checked
 *
 * @throws IllegalStateException if the partition checks fail
 */
public void checkPartitionChanges(RepGroupId rgId,
                                  Topology newTopo)
    throws IllegalStateException {

    if ((topology == null) || (topology.getPartitionMap().size() == 0)) {
        return;
    }

    final Set<PartitionId> currentPartitions =
        getRGPartitions(rgId, topology);
    final Set<PartitionId> newPartitions = getRGPartitions(rgId, newTopo);

    /* First pass: partitions the new topology assigns to this RG. */
    for (PartitionId npId : newPartitions) {
        final Partition np = newTopo.get(npId);
        final Partition cp = topology.get(npId);

        if (np.getRepGroupId().equals(cp.getRepGroupId())) {
            /*
             * Has the same RG in old and new topos. Ignore the local
             * topology, it can contain entries for partitions that are
             * still in flight.
             */
            currentPartitions.remove(npId);
            continue;
        }

        /* Old/new mismatch account for it via local topology. */
        final Partition lp = (localTopology != null) ?
            localTopology.get(npId) : null;

        /*
         * A new partition associated with the RG. It should be compatible
         * with the definition in the local topology.
         */
        if (lp == null) {
            /*
             * There cannot be a difference if no migration is in
             * progress.
             */
            final String msg =
                String.format("%s in the new topology(seq #: %,d) " +
                              "is absent from this shard in the current " +
                              "topology(seq #: %,d) and there is no " +
                              "partition migration in progress.",
                              np, newTopo.getSequenceNumber(),
                              topology.getSequenceNumber());
            throw new IllegalStateException(msg);
        }

        /*
         * Check whether the partition has been migrated and is therefore
         * in the local topology.
         */
        if (lp.getRepGroupId().equals(np.getRepGroupId())) {
            /*
             * If it's in the new topology, its RG must exist with that RG
             * in the local topology
             */
            continue;
        }

        /* Disagreement on which RG the partition should be in. */
        final String msg =
            String.format("%s in the new topology(seq #: %,d) and %s " +
                          "in the local topology(internal seq#: %,d) " +
                          "are associated with different shards",
                          np, newTopo.getSequenceNumber(),
                          lp, localTopology.getSequenceNumber());

        throw new IllegalStateException(msg);
    }

    /*
     * Any residual current partitions (after the removal of matching
     * partitions above) should represent partitions that were migrated
     * away from this migration source. They must be in the local topology,
     * their definition in the localTopology may not agree with the
     * definition in the new topology, since they may be in the process of
     * being migrated.
     */
    for (PartitionId cpId : currentPartitions) {
        final Partition cp = topology.get(cpId);
        final Partition lp = (localTopology != null) ?
            localTopology.get(cpId) : null;

        if (lp == null) {
            /*
             * There cannot be a difference if no migration is in
             * progress.
             */
            final String msg =
                String.format("%s is in the current topology(seq #: %,d)" +
                              " but is absent from the new topology" +
                              "(seq #: %,d) and there is no " +
                              "partition migration in progress.",
                              cp, topology.getSequenceNumber(),
                              newTopo.getSequenceNumber());
            throw new IllegalStateException(msg);
        }

        /*
         * Check whether the partition has been migrated away and is
         * therefore in the local topology associated with a different RG
         */
        if (!lp.getRepGroupId().equals(cp.getRepGroupId())) {
            /*
             * A partition that was migrated out of this group. Note that
             * we are not actually checking whether the migration has
             * completed to keep things simple.
             */
            continue;
        }

        /*
         * Partition is present in the local topology with the same RG as
         * the current topo. Disagreement on RGs between current and new
         * topo that cannot be justified by the local topo.
         */
        final String msg =
            String.format("%s is associated with the same shard in both " +
                          "the current(seq #: %,d) and local topologies " +
                          "but is associated with a different shard %s " +
                          "in the new topology(seq#: %,d). ",
                          cp,
                          topology.getSequenceNumber(),
                          newTopo.get(cpId).getRepGroupId(),
                          newTopo.getSequenceNumber());

        throw new IllegalStateException(msg);
    }
}
/**
 * A utility method to retrieve all the partitions associated with an RG.
 * A new, mutable set is returned on every call, so the caller is free to
 * modify it.
 *
 * @param rgId identifies the filtering RG
 *
 * @param topo the topology containing the partitions
 *
 * @return the partition ids of the partitions hosted by the RG
 */
private Set<PartitionId> getRGPartitions(RepGroupId rgId,
                                         Topology topo) {

    final Set<PartitionId> hostedPartitions =
        new HashSet<PartitionId>(100);

    for (Partition p : topo.getPartitionMap().getAll()) {
        if (!p.getRepGroupId().equals(rgId)) {
            /* Hosted by some other RG. */
            continue;
        }
        hostedPartitions.add(p.getResourceId());
    }

    return hostedPartitions;
}
/**
 * Checks that the version of the given topology is acceptable. The
 * version should normally already match the current version, since
 * deserialization is expected to upgrade it. A version of zero is
 * tolerated with a warning; any other mismatch is rejected.
 *
 * @param logger where the warning is written
 * @param topology the topology whose version is checked
 *
 * @throws OperationFaultException if the version is neither the current
 * version nor zero
 */
public static void checkVersion(Logger logger,
                                Topology topology) {

    final int topoVersion = topology.getVersion();

    if (topoVersion != Topology.CURRENT_VERSION) {
        if (topoVersion != 0) {
            /* Should not happen. */
            throw new OperationFaultException
                ("Encountered topology with version: " + topoVersion +
                 " Current topology version: " + Topology.CURRENT_VERSION);
        }

        /*
         * r1 topology, inconsistent distribution of RNs across DCs. Warn
         * and keep going.
         */
        logger.warning("Using r1 topology, it was not upgraded.");
    }

    /* Otherwise all's well, keep going. */
}
/**
 * Performs an incremental update to the Topology.
 *
 * The update is sometimes done in the request/response loop, but it would
 * be better if the update was done asynchronously so as not to impact
 * request latency. We need an async version of the update operation for
 * this purpose. Not a pressing issue, since Topology updates are
 * infrequent.
 *
 * An update may result in the topology changes being pruned so that only
 * the configured number of changes are retained.
 *
 * The changes are applied to a working copy; the current topology is only
 * replaced once the pre update listeners and the localizer have accepted
 * the working copy. If either rejects it, or the changes do not alter the
 * topology, the method returns without modifying the manager.
 *
 * This method has package access for unit test
 *
 * A contract of this method is that, the topology Id, the change
 * information and the signature should come from the same topology
 * instance.
 *
 * @param topologyId the topology id associated with the changes
 * @param changes the changes to be made to the current copy of the
 * Topology
 * @param topoSignature the signature of the topology where the changes
 * originated.
 */
synchronized void update(long topologyId,
                         List<TopologyChange> changes,
                         byte[] topoSignature) {

    /*
     * The topology can be null if the node was waiting for a topo
     * push from another node, e.g. during replica start up.
     */
    final Topology workingCopy = (topology == null) ?
        new Topology(kvsName, topologyId) : topology.getCopy();

    checkTopologyId(workingCopy.getId(), topologyId);

    final int prevSequenceNumber = workingCopy.getSequenceNumber();

    if (!workingCopy.apply(changes)) {
        /* Topology not changed */
        return;
    }
    workingCopy.updateSignature(topoSignature);

    /*
     * Pre-updater may verify the signature of new topology copy. If the
     * verification fails, don't continue with the update.
     */
    try {
        invokePreUpdateListeners(workingCopy);
    } catch (InvalidSignatureException ise) {
        logger.log(Level.INFO,
                   "Topology incremental update to seq# {0} skipped " +
                   "due to invalid signature.",
                   workingCopy.getSequenceNumber());
        return;
    }

    /* Give up if the working copy cannot be localized. */
    if (!updateLocalTopology(workingCopy)) {
        return;
    }

    /* Make an atomic change. */
    topology = workingCopy.pruneChanges(changes.get(0).getSequenceNumber(),
                                        maxTopoChanges);

    logger.log(Level.INFO,
               "Topology incrementally updated from seq#: {0} to {1}",
               new Object[]{prevSequenceNumber,
                            topology.getSequenceNumber()});

    invokePostUpdateListeners();
}
/**
 * Performs an incremental update using the topology id, the changes and
 * the signature carried by the given TopologyInfo.
 *
 * @param topoInfo supplies the topology id, changes and signature
 */
public synchronized void update(TopologyInfo topoInfo) {
    update(topoInfo.getTopoId(),
           topoInfo.getChanges(),
           topoInfo.getTopoSignature());
}
/**
 * Checks that a remote topology may be used to update the local one. The
 * two are compatible when their ids are equal. All pre r2 topologies, or
 * r2 topologies that are communicated by r1 clients, have the topology id
 * zero; such an id on either side is treated as matching any id, for
 * compatibility.
 *
 * @param localTopoId the local topology id
 * @param remoteTopoId the remote topology id
 *
 * @throws WrappedClientException wrapping a StaleStoreHandleException if
 * the ids are incompatible
 */
private void checkTopologyId(long localTopoId,
                             long remoteTopoId) {

    final boolean sameTopology = (localTopoId == remoteTopoId);

    // TODO: Remove if we decide not to support r1 clients with r2 RNs
    final boolean uncheckedId =
        (localTopoId == Topology.NOCHECK_TOPOLOGY_ID) ||
        (remoteTopoId == Topology.NOCHECK_TOPOLOGY_ID);

    if (sameTopology || uncheckedId) {
        return;
    }

    /* The ids are formatted as dates in the message. */
    final Date localCreated = new Date(localTopoId);
    final Date remoteCreated = new Date(remoteTopoId);

    final String msg = "Inconsistent use of Topology. " +
        "An attempt was made to update this topology created on " +
        localCreated +
        " with changes originating from a different topology created on " +
        remoteCreated +
        ". This exception indicates an application configuration issue." +
        " Check if this store handle belongs to an older, now defunct " +
        "store.";

    /*
     * An operation fault exception is thrown on purpose here instead of
     * IllegalStateException. The latter is a catastrophic exception that
     * shuts down the RN process. This led to the bug described in
     * SR [#24693], where connection attempts from old clients make an RN
     * repeatedly throw IllegalStateException, which ultimately brings the
     * RN down! Clients should never be able to create server side failure
     * like that, so a StaleStoreHandleException is used instead, telling
     * the client that it has to close and reopen its handle.
     */
    final StaleStoreHandleException staleHandle =
        new StaleStoreHandleException(msg);
    throw new WrappedClientException(staleHandle);
}
/**
 * Re-derives the local topology from the current topology. When that
 * succeeds the post update listeners are invoked.
 *
 * @return true if the local topology was updated, or if there is no
 * topology yet; false if the local topology could not be updated
 */
public synchronized boolean updateLocalTopology() {

    if (topology == null) {
        /*
         * Nothing has been installed yet. Report success, but leave the
         * listeners alone.
         */
        return true;
    }

    final boolean localized = updateLocalTopology(topology);
    if (localized) {
        invokePostUpdateListeners();
    }
    return localized;
}
/**
 * Asks the localizer, if there is one, to localize the given topology and
 * stores the result as the local topology. Without a localizer nothing is
 * changed and true is returned.
 *
 * @param newTopology the topology to localize
 * @return false if the localizer could not localize the topology,
 * otherwise true
 */
private boolean updateLocalTopology(Topology newTopology) {

    if (localizer == null) {
        return true;
    }

    final Topology localized = localizer.localizeTopology(newTopology);
    if (localized != null) {
        localTopology = localized;
        return true;
    }

    /* A null result means the topology could not be localized. */
    logger.log(Level.INFO, "Topology update to {0} skipped. " +
               "Unable to update local topology.",
               newTopology.getSequenceNumber());
    return false;
}
/**
 * Returns true if the partition is in the process
 * of moving (changing groups) or has moved.
 *
 * False is returned when the partition id is the null id, when no
 * topology has been installed yet, or when either the local or the
 * current topology has no group for the partition.
 *
 * @param partitionId the partition to check
 * @return true if the local and current topologies place the partition
 * in different groups
 */
public boolean inTransit(PartitionId partitionId) {

    if (partitionId.isNull()) {
        return false;
    }

    /*
     * The manager starts out without a topology. Nothing can be known to
     * be in transit until one is installed.
     */
    final Topology current = getTopology();
    final Topology local = getLocalTopology();
    if ((current == null) || (local == null)) {
        return false;
    }

    final RepGroupId localGroupId = local.getRepGroupId(partitionId);
    final RepGroupId currentGroupId = current.getRepGroupId(partitionId);

    if ((localGroupId == null) || (currentGroupId == null)) {
        return false;
    }

    /*
     * If the local group has changed, then the partition is in transit.
     */
    return localGroupId.getGroupId() != currentGroupId.getGroupId();
}
/**
 * A callback that is notified after the topology has changed.
 */
public interface PostUpdateListener {
/**
 * The update method is invoked after either the "official" or "local"
 * topology has been updated. Implementations must take care to avoid
 * deadlocks as the topology manager instance will be locked at the
 * time of the call to postUpdate().
 *
 * @param topology the manager's current "official" topology
 *
 * @return true if the listener is no longer needed and can be removed
 * from the list
 */
boolean postUpdate(Topology topology);
}
/**
 * A callback that is notified before the topology is changed, giving it
 * the chance to reject the new topology.
 */
public interface PreUpdateListener {
/**
 * The update method is invoked before the "official" topology is
 * updated. Exceptions resulting from the listener will abort the
 * topology update operation. Implementations must take care to avoid
 * deadlocks as the topology manager instance will be locked at the
 * time of the call to preUpdate().
 *
 * @param topology the candidate topology that is about to be installed
 */
void preUpdate(Topology topology);
}
/**
 * A hook that derives the local topology from a topology that is being
 * installed.
 */
public interface Localizer {
/**
 * Localizes the specified topology. The localized topology is returned.
 * The return value may be the input topology if no changes are made.
 * Null is returned if it was not possible to localize the topology, in
 * which case the manager skips the topology update.
 *
 * @param topology the topology to localize
 * @return a localized topology or null
 */
Topology localizeTopology(Topology topology);
}
}