All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.infinispan.statetransfer.StateConsumerImpl Maven / Gradle / Ivy

package org.infinispan.statetransfer;

import static org.infinispan.context.Flag.CACHE_MODE_LOCAL;
import static org.infinispan.context.Flag.IGNORE_RETURN_VALUES;
import static org.infinispan.context.Flag.PUT_FOR_STATE_TRANSFER;
import static org.infinispan.context.Flag.SKIP_LOCKING;
import static org.infinispan.context.Flag.SKIP_OWNERSHIP_CHECK;
import static org.infinispan.context.Flag.SKIP_REMOTE_LOOKUP;
import static org.infinispan.context.Flag.SKIP_SHARED_CACHE_STORE;
import static org.infinispan.context.Flag.SKIP_XSITE_BACKUP;
import static org.infinispan.factories.KnownComponentNames.STATE_TRANSFER_EXECUTOR;
import static org.infinispan.persistence.manager.PersistenceManager.AccessMode.PRIVATE;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import javax.transaction.SystemException;
import javax.transaction.TransactionManager;

import org.infinispan.Cache;
import org.infinispan.commands.CommandsFactory;
import org.infinispan.commands.write.InvalidateCommand;
import org.infinispan.commands.write.PutKeyValueCommand;
import org.infinispan.commons.CacheException;
import org.infinispan.commons.util.EnumUtil;
import org.infinispan.commons.util.SmallIntSet;
import org.infinispan.commons.util.concurrent.ConcurrentHashSet;
import org.infinispan.configuration.cache.CacheMode;
import org.infinispan.configuration.cache.Configuration;
import org.infinispan.container.DataContainer;
import org.infinispan.container.entries.InternalCacheEntry;
import org.infinispan.context.InvocationContext;
import org.infinispan.context.InvocationContextFactory;
import org.infinispan.context.impl.TxInvocationContext;
import org.infinispan.distexec.DistributedCallable;
import org.infinispan.distribution.DistributionInfo;
import org.infinispan.distribution.TriangleOrderManager;
import org.infinispan.distribution.DistributionManager;
import org.infinispan.distribution.ch.ConsistentHash;
import org.infinispan.distribution.ch.KeyPartitioner;
import org.infinispan.executors.LimitedExecutor;
import org.infinispan.factories.KnownComponentNames;
import org.infinispan.factories.annotations.ComponentName;
import org.infinispan.factories.annotations.Inject;
import org.infinispan.factories.annotations.Start;
import org.infinispan.factories.annotations.Stop;
import org.infinispan.filter.KeyFilter;
import org.infinispan.interceptors.AsyncInterceptorChain;
import org.infinispan.notifications.cachelistener.CacheNotifier;
import org.infinispan.persistence.manager.PersistenceManager;
import org.infinispan.remoting.responses.CacheNotFoundResponse;
import org.infinispan.remoting.responses.Response;
import org.infinispan.remoting.responses.SuccessfulResponse;
import org.infinispan.remoting.rpc.ResponseMode;
import org.infinispan.remoting.rpc.RpcManager;
import org.infinispan.remoting.rpc.RpcOptions;
import org.infinispan.remoting.transport.Address;
import org.infinispan.remoting.transport.jgroups.SuspectException;
import org.infinispan.topology.CacheTopology;
import org.infinispan.topology.LocalTopologyManager;
import org.infinispan.transaction.impl.RemoteTransaction;
import org.infinispan.transaction.impl.TransactionTable;
import org.infinispan.transaction.totalorder.TotalOrderLatch;
import org.infinispan.transaction.totalorder.TotalOrderManager;
import org.infinispan.transaction.xa.CacheTransaction;
import org.infinispan.transaction.xa.GlobalTransaction;
import org.infinispan.util.concurrent.BlockingTaskAwareExecutorService;
import org.infinispan.util.concurrent.CommandAckCollector;
import org.infinispan.util.concurrent.TimeoutException;
import org.infinispan.util.logging.Log;
import org.infinispan.util.logging.LogFactory;

import net.jcip.annotations.GuardedBy;

/**
 * {@link StateConsumer} implementation.
 *
 * @author [email protected]
 * @since 5.2
 */
public class StateConsumerImpl implements StateConsumer {

   private static final Log log = LogFactory.getLog(StateConsumerImpl.class);
   private static final boolean trace = log.isTraceEnabled();
   private static final int NO_STATE_TRANSFER_IN_PROGRESS = -1;
   private static final long STATE_TRANSFER_FLAGS = EnumUtil.bitSetOf(PUT_FOR_STATE_TRANSFER, CACHE_MODE_LOCAL,
                                                                      IGNORE_RETURN_VALUES, SKIP_REMOTE_LOOKUP,
                                                                      SKIP_SHARED_CACHE_STORE, SKIP_OWNERSHIP_CHECK,
                                                                      SKIP_XSITE_BACKUP);

   private Cache cache;
   private StateTransferManager stateTransferManager;
   private LocalTopologyManager localTopologyManager;
   private String cacheName;
   private Configuration configuration;
   private RpcManager rpcManager;
   private TransactionManager transactionManager;   // optional
   private CommandsFactory commandsFactory;
   private TransactionTable transactionTable;       // optional
   private DataContainer dataContainer;
   private PersistenceManager persistenceManager;
   private AsyncInterceptorChain interceptorChain;
   private InvocationContextFactory icf;
   private StateTransferLock stateTransferLock;
   private CacheNotifier cacheNotifier;
   private TotalOrderManager totalOrderManager;
   private BlockingTaskAwareExecutorService remoteCommandsExecutor;
   private long timeout;
   private boolean isFetchEnabled;
   private boolean isTransactional;
   private boolean isInvalidationMode;
   private boolean isTotalOrder;
   private volatile KeyInvalidationListener keyInvalidationListener; //for test purpose only!
   private CommitManager commitManager;
   private ExecutorService stateTransferExecutor;
   private CommandAckCollector commandAckCollector;
   private TriangleOrderManager triangleOrderManager;
   private DistributionManager distributionManager;
   private KeyPartitioner keyPartitioner;

   private volatile CacheTopology cacheTopology;

   /**
    * Indicates if there is a state transfer in progress. It is set to the new topology id when onTopologyUpdate with
    * isRebalance==true is called.
    * It is changed back to NO_REBALANCE_IN_PROGRESS when a topology update with a null pending CH is received.
    */
   private final AtomicInteger stateTransferTopologyId = new AtomicInteger(NO_STATE_TRANSFER_IN_PROGRESS);

   /**
    * Indicates if there is a rebalance in progress and there the local node has not yet received
    * all the new segments yet. It is set to true when rebalance starts and becomes when all inbound transfers have completed
    * (before stateTransferTopologyId is set back to NO_REBALANCE_IN_PROGRESS).
    */
   private final AtomicBoolean waitingForState = new AtomicBoolean(false);

   private final Object transferMapsLock = new Object();

   /**
    * A map that keeps track of current inbound state transfers by source address. There could be multiple transfers
    * flowing in from the same source (but for different segments) so the values are lists. This works in tandem with
    * transfersBySegment so they always need to be kept in sync and updates to both of them need to be atomic.
    */
   @GuardedBy("transferMapsLock")
   private final Map> transfersBySource = new HashMap<>();

   /**
    * A map that keeps track of current inbound state transfers by segment id. There is at most one transfers per segment.
    * This works in tandem with transfersBySource so they always need to be kept in sync and updates to both of them
    * need to be atomic.
    */
   @GuardedBy("transferMapsLock")
   private final Map transfersBySegment = new HashMap<>();

   /**
    * Push RPCs on a background thread
    */
   private LimitedExecutor stateRequestExecutor;

   private volatile boolean ownsData = false;

   private RpcOptions synchronousRpcOptions;
   private RpcOptions synchronousIgnoreLeaversRpcOptions;

   public StateConsumerImpl() {
   }

   /**
    * Stops applying incoming state. Also stops tracking updated keys. Should be called at the end of state transfer or
    * when a ClearCommand is committed during state transfer.
    */
   @Override
   public void stopApplyingState() {
      if (trace) log.tracef("Stop keeping track of changed keys for state transfer");
      commitManager.stopTrack(PUT_FOR_STATE_TRANSFER);
   }

   @Inject
   public void init(Cache cache,
                    @ComponentName(STATE_TRANSFER_EXECUTOR) ExecutorService stateTransferExecutor,
                    StateTransferManager stateTransferManager,
                    LocalTopologyManager localTopologyManager,
                    AsyncInterceptorChain interceptorChain,
                    InvocationContextFactory icf,
                    Configuration configuration,
                    RpcManager rpcManager,
                    TransactionManager transactionManager,
                    CommandsFactory commandsFactory,
                    PersistenceManager persistenceManager,
                    DataContainer dataContainer,
                    TransactionTable transactionTable,
                    StateTransferLock stateTransferLock,
                    CacheNotifier cacheNotifier,
                    TotalOrderManager totalOrderManager,
                    @ComponentName(
                          KnownComponentNames.REMOTE_COMMAND_EXECUTOR) BlockingTaskAwareExecutorService remoteCommandsExecutor,
                    CommitManager commitManager,
                    CommandAckCollector commandAckCollector,
                    TriangleOrderManager triangleOrderManager,
                    DistributionManager distributionManager, KeyPartitioner keyPartitioner) {
      this.cache = cache;
      this.cacheName = cache.getName();
      this.stateTransferExecutor = stateTransferExecutor;
      this.stateTransferManager = stateTransferManager;
      this.localTopologyManager = localTopologyManager;
      this.interceptorChain = interceptorChain;
      this.icf = icf;
      this.configuration = configuration;
      this.rpcManager = rpcManager;
      this.transactionManager = transactionManager;
      this.commandsFactory = commandsFactory;
      this.persistenceManager = persistenceManager;
      this.dataContainer = dataContainer;
      this.transactionTable = transactionTable;
      this.stateTransferLock = stateTransferLock;
      this.cacheNotifier = cacheNotifier;
      this.totalOrderManager = totalOrderManager;
      this.remoteCommandsExecutor = remoteCommandsExecutor;
      this.commitManager = commitManager;
      this.commandAckCollector = commandAckCollector;
      this.triangleOrderManager = triangleOrderManager;
      this.distributionManager = distributionManager;
      this.keyPartitioner = keyPartitioner;

      isInvalidationMode = configuration.clustering().cacheMode().isInvalidation();

      isTransactional = configuration.transaction().transactionMode().isTransactional();
      isTotalOrder = configuration.transaction().transactionProtocol().isTotalOrder();

      timeout = configuration.clustering().stateTransfer().timeout();
   }

   public boolean hasActiveTransfers() {
      synchronized (transferMapsLock) {
         return !transfersBySource.isEmpty();
      }
   }

   @Override
   public boolean isStateTransferInProgress() {
      return stateTransferTopologyId.get() != NO_STATE_TRANSFER_IN_PROGRESS;
   }

   @Override
   public boolean isStateTransferInProgressForKey(Object key) {
      if (isInvalidationMode) {
         // In invalidation mode it is of not much relevance if the key is actually being transferred right now.
         // A false response to this will just mean the usual remote lookup before a write operation is not
         // performed and a null is assumed. But in invalidation mode the user must expect the data can disappear
         // from cache at any time so this null previous value should not cause any trouble.
         return false;
      }

      DistributionInfo distributionInfo = distributionManager.getCacheTopology().getDistribution(key);
      return distributionInfo.isWriteOwner() && !distributionInfo.isReadOwner();
   }

   @Override
   public boolean ownsData() {
      return ownsData;
   }

   @Override
   public void onTopologyUpdate(final CacheTopology cacheTopology, final boolean isRebalance) {
      final boolean isMember = cacheTopology.getMembers().contains(rpcManager.getAddress());
      if (trace) log.tracef("Received new topology for cache %s, isRebalance = %b, isMember = %b, topology = %s", cacheName, isRebalance, isMember, cacheTopology);

      if (!ownsData && isMember) {
         ownsData = true;
      } else if (ownsData && !isMember) {
         // This can happen after a merge, if the local node was in a minority partition.
         ownsData = false;
      }

      // If a member leaves/crashes immediately after a rebalance was started, the new CH_UPDATE
      // command may be executed before the REBALANCE_START command, so it has to start the rebalance.
      boolean startRebalance = isRebalance;
      if (!isRebalance) {
         if (cacheTopology.getPendingCH() != null && this.cacheTopology.getPendingCH() == null) {
            if (trace) log.tracef("Forcing startRebalance = true");
            startRebalance = true;
         }
      }
      if (startRebalance) {
         // Only update the rebalance topology id when starting the rebalance, as we're going to ignore any state
         // response with a smaller topology id
         stateTransferTopologyId.compareAndSet(NO_STATE_TRANSFER_IN_PROGRESS, cacheTopology.getTopologyId());
         cacheNotifier.notifyDataRehashed(cacheTopology.getCurrentCH(), cacheTopology.getPendingCH(),
                                          cacheTopology.getUnionCH(), cacheTopology.getTopologyId(), true);
      }

      awaitTotalOrderTransactions(cacheTopology, startRebalance);

      // Make sure we don't send a REBALANCE_CONFIRM command before we've added all the transfer tasks
      // even if some of the tasks are removed and re-added
      waitingForState.set(false);

      final ConsistentHash newWriteCh = cacheTopology.getWriteConsistentHash();
      final CacheTopology previousCacheTopology = this.cacheTopology;
      final ConsistentHash previousReadCh =
            previousCacheTopology != null ? previousCacheTopology.getReadConsistentHash() : null;
      final ConsistentHash previousWriteCh =
            previousCacheTopology != null ? previousCacheTopology.getWriteConsistentHash() : null;
      // Ensures writes to the data container use the right consistent hash
      // No need for a try/finally block, since it's just an assignment
      stateTransferLock.acquireExclusiveTopologyLock();
      this.cacheTopology = cacheTopology;
      triangleOrderManager.updateCacheTopology(cacheTopology);
      if (distributionManager != null) {
         distributionManager.setCacheTopology(cacheTopology);
      }
      if (startRebalance) {
         if (trace) log.tracef("Start keeping track of keys for rebalance");
         commitManager.stopTrack(PUT_FOR_STATE_TRANSFER);
         commitManager.startTrack(PUT_FOR_STATE_TRANSFER);
      }
      stateTransferLock.releaseExclusiveTopologyLock();
      stateTransferLock.notifyTopologyInstalled(cacheTopology.getTopologyId());
      remoteCommandsExecutor.checkForReadyTasks();

      try {
         // fetch transactions and data segments from other owners if this is enabled
         if (isTransactional || isFetchEnabled) {
            Set addedSegments;
            if (previousWriteCh == null) {
               // If we have any segments assigned in the initial CH, it means we are the first member.
               // If we are not the first member, we can only add segments via rebalance.
               addedSegments = Collections.emptySet();

               // TODO Perhaps we should only do this once we are a member, as listener installation should happen only on cache members?
               if (configuration.clustering().cacheMode().isDistributed()) {
                  Collection callables = getClusterListeners(cacheTopology);
                  for (DistributedCallable callable : callables) {
                     callable.setEnvironment(cache, null);
                     try {
                        callable.call();
                     } catch (Exception e) {
                        log.clusterListenerInstallationFailure(e);
                     }
                  }
               }

               if (trace) {
                  log.tracef("On cache %s we have: added segments: %s", cacheName, addedSegments);
               }
            } else {
               Set previousSegments = getOwnedSegments(previousWriteCh);
               Set newSegments = getOwnedSegments(newWriteCh);

               SmallIntSet removedSegments;
               if (newSegments.size() == newWriteCh.getNumSegments()) {
                  // Optimization for replicated caches
                  removedSegments = new SmallIntSet();
               } else {
                  removedSegments = new SmallIntSet(previousSegments);
                  removedSegments.removeAll(newSegments);
               }

               // This is a rebalance, we need to request the segments we own in the new CH.
               addedSegments = new SmallIntSet(newSegments);
               addedSegments.removeAll(previousSegments);

               if (trace) {
                  log.tracef("On cache %s we have: new segments: %s; old segments: %s", cacheName, newSegments, previousSegments);
                  log.tracef("On cache %s we have: added segments: %s; removed segments: %s", cacheName, addedSegments, removedSegments);
               }

               // remove inbound transfers for segments we no longer own
               cancelTransfers(removedSegments);

               if (!startRebalance && !addedSegments.isEmpty()) {
                  // If the last owner of a segment leaves the cluster, a new set of owners is assigned,
                  // but the new owners should not try to retrieve the segment from each other.
                  // If this happens during a rebalance, we might have already sent our rebalance
                  // confirmation, so the coordinator won't wait for us to retrieve those segments anyway.
                  log.debugf("Not requesting segments %s because the last owner left the cluster",
                        addedSegments);
                  addedSegments.clear();
               }

               // check if any of the existing transfers should be restarted from a different source because
               // the initial source is no longer a member
               restartBrokenTransfers(cacheTopology, addedSegments);
            }

            if (!addedSegments.isEmpty()) {
               // add transfers for new or restarted segments
               addTransfers(addedSegments);
            }
         }

         int stateTransferTopologyId = this.stateTransferTopologyId.get();
         if (trace) log.tracef("Topology update processed, stateTransferTopologyId = %d, startRebalance = %s, pending CH = %s",
               (Object)stateTransferTopologyId, startRebalance, cacheTopology.getPendingCH());
         if (stateTransferTopologyId != NO_STATE_TRANSFER_IN_PROGRESS && !startRebalance && cacheTopology.getPhase() != CacheTopology.Phase.READ_OLD_WRITE_ALL) {
            // we have received a topology update that does not start a state transfer, and neither is a members update
            // due to crashed member during state transfer
            boolean changed = this.stateTransferTopologyId.compareAndSet(stateTransferTopologyId, NO_STATE_TRANSFER_IN_PROGRESS);
            if (changed) {
               stopApplyingState();

               // if the coordinator changed, we might get two concurrent topology updates,
               // but we only want to notify the @DataRehashed listeners once
               cacheNotifier.notifyDataRehashed(previousReadCh, cacheTopology.getPendingCH(), previousWriteCh,
                     cacheTopology.getTopologyId(), false);
               if (trace) {
                  log.tracef("Unlock State Transfer in Progress for topology ID %s", cacheTopology.getTopologyId());
               }
               if (isTotalOrder) {
                  totalOrderManager.notifyStateTransferEnd();
               }
            }
         }
      } finally {
         stateTransferLock.notifyTransactionDataReceived(cacheTopology.getTopologyId());
         remoteCommandsExecutor.checkForReadyTasks();

         // Only set the flag here, after all the transfers have been added to the transfersBySource map
         if (stateTransferTopologyId.get() != NO_STATE_TRANSFER_IN_PROGRESS && isMember) {
            waitingForState.set(true);
         }

         notifyEndOfStateTransferIfNeeded(cacheTopology.getTopologyId(), cacheTopology.getRebalanceId());

         // Remove the transactions whose originators have left the cache.
         // Need to do it now, after we have applied any transactions from other nodes,
         // and after notifyTransactionDataReceived - otherwise the RollbackCommands would block.
         try {
            if (transactionTable != null) {
               transactionTable.cleanupLeaverTransactions(rpcManager.getTransport().getMembers());
            }
         } catch (Exception e) {
            // Do not fail state transfer when the cleanup fails. See ISPN-7437 for details.
            log.transactionCleanupError(e);
         }

         commandAckCollector.onMembersChange(newWriteCh.getMembers());

         // The rebalance (READ_OLD_WRITE_ALL) is confirmed through notifyEndOfRebalanceIfNeeded
         // and STABLE does not have to be confirmed at all
         switch (cacheTopology.getPhase()) {
            case READ_ALL_WRITE_ALL:
            case READ_NEW_WRITE_ALL:
               localTopologyManager.confirmRebalancePhase(cacheName, cacheTopology.getTopologyId(), cacheTopology.getRebalanceId(), null);
         }

         // Any data for segments we do not own should be removed from data container and cache store
         // We need to discard data from all segments we don't own, not just those we previously owned,
         // when we lose membership (e.g. because there was a merge, the local partition was in degraded mode
         // and the other partition was available) or when L1 is enabled.
         Set removedSegments;
         boolean wasMember =
               previousWriteCh != null && previousWriteCh.getMembers().contains(rpcManager.getAddress());
         if ((isMember || wasMember) && cacheTopology.getPhase() == CacheTopology.Phase.NO_REBALANCE) {
            removedSegments = new HashSet<>(newWriteCh.getNumSegments());
            for (int i = 0; i < newWriteCh.getNumSegments(); i++) {
               removedSegments.add(i);
            }
            Set newSegments = getOwnedSegments(newWriteCh);
            removedSegments.removeAll(newSegments);

            try {
               removeStaleData(removedSegments);
            } catch (InterruptedException e) {
               Thread.currentThread().interrupt();
               throw new CacheException(e);
            }
         }
      }
   }

   private void awaitTotalOrderTransactions(CacheTopology cacheTopology, boolean isRebalance) {
      //in total order, we should wait for remote transactions before proceeding
      if (isTotalOrder) {
         if (trace) {
            log.trace("State Transfer in Total Order cache. Waiting for remote transactions to finish");
         }
         try {
            for (TotalOrderLatch block : totalOrderManager.notifyStateTransferStart(cacheTopology.getTopologyId(), isRebalance)) {
               block.awaitUntilUnBlock();
            }
         } catch (InterruptedException e) {
            //interrupted...
            Thread.currentThread().interrupt();
            throw new CacheException(e);
         }
         if (trace) {
            log.trace(
                  "State Transfer in Total Order cache. All remote transactions are finished. Moving on...");
         }
      }

   }

   private void notifyEndOfStateTransferIfNeeded(int topologyId, int rebalanceId) {
      if (waitingForState.get() && !hasActiveTransfers()) {
         if (waitingForState.compareAndSet(true, false)) {
            log.debugf("Finished receiving of segments for cache %s for topology %d.", cacheName, topologyId);
            stopApplyingState();
            stateTransferManager.notifyEndOfStateTransfer(topologyId, rebalanceId);
         }
      }
   }

   private Set getOwnedSegments(ConsistentHash consistentHash) {
      Address address = rpcManager.getAddress();
      return consistentHash.getMembers().contains(address) ? consistentHash.getSegmentsForOwner(address)
            : Collections.emptySet();
   }

   @Override
   public void applyState(final Address sender, int topologyId, Collection stateChunks) {
      ConsistentHash wCh = cacheTopology.getWriteConsistentHash();
      // Ignore responses received after we are no longer a member
      if (!wCh.getMembers().contains(rpcManager.getAddress())) {
         if (trace) {
            log.tracef("Ignoring received state because we are no longer a member of cache %s", cacheName);
         }
         return;
      }

      // Ignore segments that we requested for a previous rebalance
      // Can happen when the coordinator leaves, and the new coordinator cancels the rebalance in progress
      int rebalanceTopologyId = stateTransferTopologyId.get();
      if (rebalanceTopologyId == NO_STATE_TRANSFER_IN_PROGRESS) {
         log.debugf("Discarding state response with topology id %d for cache %s, we don't have a state transfer in progress",
               topologyId, cacheName);
         return;
      }
      if (topologyId < rebalanceTopologyId) {
         log.debugf("Discarding state response with old topology id %d for cache %s, state transfer request topology was %b",
               topologyId, cacheName, waitingForState);
         return;
      }

      if (trace) {
         log.tracef("Before applying the received state the data container of cache %s has %d keys", cacheName,
                    dataContainer.sizeIncludingExpired());
      }
      final Set mySegments = wCh.getSegmentsForOwner(rpcManager.getAddress());
      final CountDownLatch countDownLatch = new CountDownLatch(stateChunks.size());
      for (final StateChunk stateChunk : stateChunks) {
         stateTransferExecutor.submit(() -> {
            applyChunk(sender, mySegments, stateChunk);
            countDownLatch.countDown();
         });
      }
      try {
         boolean await = countDownLatch.await(timeout, TimeUnit.MILLISECONDS);
         if (!await) {
            throw new TimeoutException("Timed out applying state");
         }
      } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
         throw new CacheException(e);
      }

      if (trace) {
         log.tracef("After applying the received state the data container of cache %s has %d keys", cacheName,
                    dataContainer.sizeIncludingExpired());
         synchronized (transferMapsLock) {
            log.tracef("Segments not received yet for cache %s: %s", cacheName, transfersBySource);
         }
      }
   }

   private void applyChunk(Address sender, Set mySegments, StateChunk stateChunk) {
      if (!mySegments.contains(stateChunk.getSegmentId())) {
         log.warnf("Discarding received cache entries for segment %d of cache %s because they do not belong to this node.", stateChunk.getSegmentId(), cacheName);
         return;
      }

      // Notify the inbound task that a chunk of cache entries was received
      InboundTransferTask inboundTransfer;
      synchronized (transferMapsLock) {
         inboundTransfer = transfersBySegment.get(stateChunk.getSegmentId());
      }
      if (inboundTransfer != null) {
         if (stateChunk.getCacheEntries() != null) {
            doApplyState(sender, stateChunk.getSegmentId(), stateChunk.getCacheEntries());
         }

         inboundTransfer.onStateReceived(stateChunk.getSegmentId(), stateChunk.isLastChunk());
      } else {
         if (cache.getStatus().allowInvocations()) {
            log.ignoringUnsolicitedState(sender, stateChunk.getSegmentId(), cacheName);
         }
      }
   }

   private void doApplyState(Address sender, int segmentId, Collection cacheEntries) {
      if (trace) log.tracef("Applying new state chunk for segment %d of cache %s from node %s: received %d cache entries",
            segmentId, cacheName, sender, cacheEntries.size());

      // CACHE_MODE_LOCAL avoids handling by StateTransferInterceptor and any potential locks in StateTransferLock
      boolean transactional = transactionManager != null;
      for (InternalCacheEntry e : cacheEntries) {
         try {
            InvocationContext ctx;
            if (transactional) {
               transactionManager.begin();
               ctx = icf.createInvocationContext(transactionManager.getTransaction(), true);
               ((TxInvocationContext) ctx).getCacheTransaction().setStateTransferFlag(PUT_FOR_STATE_TRANSFER);
            } else {
               // non-tx cache
               ctx = icf.createSingleKeyNonTxInvocationContext();
            }

            PutKeyValueCommand put = commandsFactory.buildPutKeyValueCommand(
                  e.getKey(), e.getValue(), e.getMetadata(), STATE_TRANSFER_FLAGS);
            ctx.setLockOwner(put.getKeyLockOwner());
            interceptorChain.invoke(ctx, put);

            if (transactionManager != null) {
               transactionManager.commit();
            }
         } catch (Exception ex) {
            if (!cache.getStatus().allowInvocations()) {
               log.debugf("Cache %s is shutting down, stopping state transfer", cacheName);
               break;
            } else {
               log.problemApplyingStateForKey(ex.getMessage(), e.getKey(), ex);
            }
         } finally {
            try {
               if (transactional && transactionManager.getTransaction() != null) {
                  transactionManager.rollback();
               }
            } catch (SystemException e1) {
               // Ignore
            }
         }
      }
      if (trace) log.tracef("Finished applying chunk of segment %d of cache %s", segmentId, cacheName);
   }

   private void applyTransactions(Address sender, Collection transactions, int topologyId) {
      log.debugf("Applying %d transactions for cache %s transferred from node %s", transactions.size(), cacheName, sender);
      if (isTransactional) {
         for (TransactionInfo transactionInfo : transactions) {
            GlobalTransaction gtx = transactionInfo.getGlobalTransaction();
            if (rpcManager.getAddress().equals(gtx.getAddress())) {
               continue; // it is a transaction originated in this node. can happen with partition handling
            }
            // Mark the global transaction as remote. Only used for logging, hashCode/equals ignore it.
            gtx.setRemote(true);

            CacheTransaction tx = transactionTable.getLocalTransaction(gtx);
            if (tx == null) {
               tx = transactionTable.getRemoteTransaction(gtx);
               if (tx == null) {
                  tx = transactionTable.getOrCreateRemoteTransaction(gtx, transactionInfo.getModifications());
                  // Force this node to replay the given transaction data by making it think it is 1 behind
                  ((RemoteTransaction) tx).setLookedUpEntriesTopology(topologyId - 1);
               }
            }
            // TODO Shouldn't this be done for transactions originated locally as well?
            transactionInfo.getLockedKeys().forEach(tx::addBackupLockForKey);
         }
      }
   }

   // Must run after the PersistenceManager
   @Start(priority = 20)
   public void start() {
      CacheMode mode = configuration.clustering().cacheMode();
      isFetchEnabled = (mode.isDistributed() || mode.isReplicated()) &&
              (configuration.clustering().stateTransfer().fetchInMemoryState() || configuration.persistence().fetchPersistentState());
      //rpc options does not changes in runtime. we can use always the same instance.
      synchronousRpcOptions = rpcManager.getRpcOptionsBuilder(ResponseMode.SYNCHRONOUS)
            .timeout(timeout, TimeUnit.MILLISECONDS).build();
      synchronousIgnoreLeaversRpcOptions = rpcManager.getRpcOptionsBuilder(ResponseMode.SYNCHRONOUS_IGNORE_LEAVERS)
            .timeout(timeout, TimeUnit.MILLISECONDS).build();

      stateRequestExecutor = new LimitedExecutor("StateRequest-" + cacheName, stateTransferExecutor, 1);
   }

   @Stop(priority = 0)
   @Override
   public void stop() {
      if (trace) {
         log.tracef("Shutting down StateConsumer of cache %s on node %s", cacheName, rpcManager.getAddress());
      }

      try {
         synchronized (transferMapsLock) {
            // cancel all inbound transfers
            stateRequestExecutor.cancelQueuedTasks();

            for (List inboundTransfers : transfersBySource.values()) {
               inboundTransfers.forEach(InboundTransferTask::cancel);
            }
            transfersBySource.clear();
            transfersBySegment.clear();
         }
      } catch (Throwable t) {
         log.errorf(t, "Failed to stop StateConsumer of cache %s on node %s", cacheName, rpcManager.getAddress());
      }
   }

   @Override
   public CacheTopology getCacheTopology() {
      return cacheTopology;
   }

   public void setKeyInvalidationListener(KeyInvalidationListener keyInvalidationListener) {
      this.keyInvalidationListener = keyInvalidationListener;
   }

   private void addTransfers(Set segments) {
      log.debugf("Adding inbound state transfer for segments %s", segments);

      // the set of nodes that reported errors when fetching data from them - these will not be retried in this topology
      Set
excludedSources = new HashSet<>(); // the sources and segments we are going to get from each source Map> sources = new HashMap<>(); if (isTransactional && !isTotalOrder) { requestTransactions(segments, sources, excludedSources); } if (isFetchEnabled) { requestSegments(segments, sources, excludedSources); } if (trace) log.tracef("Finished adding inbound state transfer for segments %s", segments, cacheName); } private void findSources(Set segments, Map> sources, Set
excludedSources) { if (cache.getStatus().isTerminated()) return; SmallIntSet segmentsWithoutSource = new SmallIntSet(configuration.clustering().hash().numSegments()); for (Integer segmentId : segments) { Address source = findSource(segmentId, excludedSources); // ignore all segments for which there are no other owners to pull data from. // these segments are considered empty (or lost) and do not require a state transfer if (source != null) { Set segmentsFromSource = sources.computeIfAbsent(source, k -> new SmallIntSet()); segmentsFromSource.add(segmentId); } else { segmentsWithoutSource.set(segmentId); } } if (!segmentsWithoutSource.isEmpty()) { log.noLiveOwnersFoundForSegments(segmentsWithoutSource, cacheName, excludedSources); } } private Address findSource(int segmentId, Set
excludedSources) { List
owners = cacheTopology.getReadConsistentHash().locateOwnersForSegment(segmentId); if (!owners.contains(rpcManager.getAddress())) { // We prefer that transactions are sourced from primary owners. // Needed in pessimistic mode, if the originator is the primary owner of the key than the lock // command is not replicated to the backup owners. See PessimisticDistributionInterceptor.acquireRemoteIfNeeded. for (Address o : owners) { if (!o.equals(rpcManager.getAddress()) && !excludedSources.contains(o)) { return o; } } } return null; } private void requestTransactions(Set segments, Map> sources, Set
excludedSources) { findSources(segments, sources, excludedSources); boolean seenFailures = false; while (true) { SmallIntSet failedSegments = new SmallIntSet(); int topologyId = cacheTopology.getTopologyId(); for (Map.Entry> sourceEntry : sources.entrySet()) { Address source = sourceEntry.getKey(); Set segmentsFromSource = sourceEntry.getValue(); boolean failed = false; boolean exclude = false; try { Response response = getTransactions(source, segmentsFromSource, topologyId); if (response instanceof SuccessfulResponse) { List transactions = (List) ((SuccessfulResponse) response).getResponseValue(); applyTransactions(source, transactions, topologyId); } else if (response instanceof CacheNotFoundResponse) { log.debugf("Cache %s was stopped on node %s before sending transaction information", cacheName, source); failed = true; exclude = true; } else { log.unsuccessfulResponseRetrievingTransactionsForSegments(source, response); failed = true; } } catch (SuspectException e) { log.debugf("Node %s left the cluster before sending transaction information", source); failed = true; exclude = true; } catch (Exception e) { if (cache.getStatus().isTerminated()) { log.debugf("Cache %s has stopped while requesting transactions", cacheName); sources.clear(); return; } else { log.failedToRetrieveTransactionsForSegments(cacheName, source, segments, e); } // The primary owner is still in the cluster, so we can't exclude it - see ISPN-4091 failed = true; } // If requesting the transactions failed we need to retry if (failed) { failedSegments.addAll(segmentsFromSource); } // If the primary owner is no longer running, we can retry on a backup owner if (exclude) { excludedSources.add(source); } } if (failedSegments.isEmpty()) { break; } // look for other sources for all failed segments seenFailures = true; sources.clear(); findSources(failedSegments, sources, excludedSources); } if (seenFailures) { // start fresh when next step starts (fetching segments) sources.clear(); } } private Collection getClusterListeners(CacheTopology topology) { for (Address source : topology.getMembers()) { // Don't send to ourselves if (!source.equals(rpcManager.getAddress())) { if (trace) { log.tracef("Requesting cluster listeners of cache %s from node %s", cacheName, source); } // get cluster listeners try { StateRequestCommand cmd = commandsFactory.buildStateRequestCommand(StateRequestCommand.Type.GET_CACHE_LISTENERS, rpcManager.getAddress(), topology.getTopologyId(), null); Map responses = rpcManager.invokeRemotely(Collections.singleton(source), cmd, synchronousIgnoreLeaversRpcOptions); Response response = responses.get(source); if (response instanceof SuccessfulResponse) { return (Collection) ((SuccessfulResponse) response).getResponseValue(); } else { log.unsuccessfulResponseForClusterListeners(source, response); } } catch (CacheException e) { log.exceptionDuringClusterListenerRetrieval(source, e); } } } if (trace) log.trace("Unable to acquire cluster listeners from other members, assuming none are present"); return Collections.emptySet(); } private Response getTransactions(Address source, Set segments, int topologyId) { if (trace) { log.tracef("Requesting transactions from node %s for segments %s", source, segments); } // get transactions and locks StateRequestCommand cmd = commandsFactory.buildStateRequestCommand(StateRequestCommand.Type.GET_TRANSACTIONS, rpcManager.getAddress(), topologyId, segments); Map responses = rpcManager.invokeRemotely(Collections.singleton(source), cmd, synchronousRpcOptions); return responses.get(source); } private void requestSegments(Set segments, Map> sources, Set
excludedSources) { if (sources.isEmpty()) { findSources(segments, sources, excludedSources); } for (Map.Entry> e : sources.entrySet()) { addTransfer(e.getKey(), e.getValue()); } } /** * Cancel transfers for segments we no longer own. * * @param removedSegments segments to be cancelled */ private void cancelTransfers(Set removedSegments) { synchronized (transferMapsLock) { List segmentsToCancel = new ArrayList<>(removedSegments); while (!segmentsToCancel.isEmpty()) { int segmentId = segmentsToCancel.remove(0); InboundTransferTask inboundTransfer = transfersBySegment.get(segmentId); if (inboundTransfer != null) { // we need to check the transfer was not already completed Set cancelledSegments = new SmallIntSet(removedSegments); cancelledSegments.retainAll(inboundTransfer.getSegments()); segmentsToCancel.removeAll(cancelledSegments); transfersBySegment.keySet().removeAll(cancelledSegments); //this will also remove it from transfersBySource if the entire task gets cancelled inboundTransfer.cancelSegments(cancelledSegments); if (inboundTransfer.isCancelled()) { removeTransfer(inboundTransfer); } } } } } private void removeStaleData(final Set removedSegments) throws InterruptedException { log.debugf("Removing no longer owned entries for cache %s", cacheName); if (keyInvalidationListener != null) { keyInvalidationListener.beforeInvalidation(removedSegments, Collections.emptySet()); } if (removedSegments.isEmpty()) return; // Keys that we used to own, and need to be removed from the data container AND the cache stores final ConcurrentHashSet keysToRemove = new ConcurrentHashSet<>(); dataContainer.executeTask(KeyFilter.ACCEPT_ALL_FILTER, (o, ice) -> { Object key = ice.getKey(); int keySegment = getSegment(key); if (removedSegments.contains(keySegment)) { keysToRemove.add(key); } }); // gather all keys from cache store that belong to the segments that are being removed/moved to L1 if (!removedSegments.isEmpty()) { try { KeyFilter filter = key -> { if (dataContainer.containsKey(key)) return false; int keySegment = getSegment(key); return (removedSegments.contains(keySegment)); }; persistenceManager.processOnAllStores(filter, (marshalledEntry, taskContext) -> keysToRemove.add(marshalledEntry.getKey()), false, false, PRIVATE); } catch (CacheException e) { log.failedLoadingKeysFromCacheStore(e); } } if (!keysToRemove.isEmpty()) { try { InvalidateCommand invalidateCmd = commandsFactory.buildInvalidateCommand(EnumUtil.bitSetOf(CACHE_MODE_LOCAL, SKIP_LOCKING), keysToRemove.toArray()); InvocationContext ctx = icf.createNonTxInvocationContext(); ctx.setLockOwner(invalidateCmd.getKeyLockOwner()); interceptorChain.invoke(ctx, invalidateCmd); if (trace) log.tracef("Removed %d keys, data container now has %d keys", keysToRemove.size(), dataContainer.sizeIncludingExpired()); } catch (CacheException e) { log.failedToInvalidateKeys(e); } } } /** * Check if any of the existing transfers should be restarted from a different source because the initial source is no longer a member. */ private void restartBrokenTransfers(CacheTopology cacheTopology, Set addedSegments) { Set
members = new HashSet<>(cacheTopology.getReadConsistentHash().getMembers()); synchronized (transferMapsLock) { for (Iterator>> it = transfersBySource.entrySet().iterator(); it.hasNext(); ) { Map.Entry> entry = it.next(); Address source = entry.getKey(); if (!members.contains(source)) { if (trace) { log.tracef("Removing inbound transfers from source %s for cache %s", source, cacheName); } List inboundTransfers = entry.getValue(); it.remove(); for (InboundTransferTask inboundTransfer : inboundTransfers) { // these segments will be restarted if they are still in new write CH if (trace) { log.tracef("Removing inbound transfers from node %s for segments %s", source, inboundTransfer.getSegments()); } inboundTransfer.cancel(); transfersBySegment.keySet().removeAll(inboundTransfer.getSegments()); addedSegments.addAll(inboundTransfer.getUnfinishedSegments()); } } } // exclude those that are already in progress from a valid source addedSegments.removeAll(transfersBySegment.keySet()); } } private int getSegment(Object key) { // here we can use any CH version because the routing table is not involved in computing the segment return keyPartitioner.getSegment(key); } private InboundTransferTask addTransfer(Address source, Set segmentsFromSource) { final InboundTransferTask inboundTransfer; synchronized (transferMapsLock) { if (trace) { log.tracef("Adding transfer from %s for segments %s", source, segmentsFromSource); } segmentsFromSource.removeAll(transfersBySegment.keySet()); // already in progress segments are excluded if (segmentsFromSource.isEmpty()) { if (trace) { log.tracef("All segments are already in progress, skipping"); } return null; } inboundTransfer = new InboundTransferTask(segmentsFromSource, source, cacheTopology.getTopologyId(), rpcManager, commandsFactory, timeout, cacheName); for (int segmentId : segmentsFromSource) { transfersBySegment.put(segmentId, inboundTransfer); } List inboundTransfers = transfersBySource .computeIfAbsent(inboundTransfer.getSource(), k -> new ArrayList<>()); inboundTransfers.add(inboundTransfer); } stateRequestExecutor.executeAsync(() -> { CompletableFuture transferStarted = inboundTransfer.requestSegments(); if (trace) log.tracef("Waiting for inbound transfer to finish: %s", inboundTransfer); return transferStarted.whenComplete((aVoid, throwable) -> onTaskCompletion(inboundTransfer)); }); return inboundTransfer; } private boolean removeTransfer(InboundTransferTask inboundTransfer) { synchronized (transferMapsLock) { if (trace) log.tracef("Removing inbound transfers from node %s for segments %s", inboundTransfer.getSegments(), inboundTransfer.getSource(), cacheName); List transfers = transfersBySource.get(inboundTransfer.getSource()); if (transfers != null) { if (transfers.remove(inboundTransfer)) { if (transfers.isEmpty()) { transfersBySource.remove(inboundTransfer.getSource()); } transfersBySegment.keySet().removeAll(inboundTransfer.getSegments()); return true; } } } return false; } void onTaskCompletion(final InboundTransferTask inboundTransfer) { if (trace) log.tracef("Inbound transfer finished: %s", inboundTransfer); if (inboundTransfer.isCompletedSuccessfully()) { removeTransfer(inboundTransfer); notifyEndOfStateTransferIfNeeded(cacheTopology.getTopologyId(), cacheTopology.getRebalanceId()); } } public interface KeyInvalidationListener { void beforeInvalidation(Set removedSegments, Set staleL1Segments); } }