All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.infinispan.interceptors.distribution.TriangleDistributionInterceptor Maven / Gradle / Ivy

There is a newer version: 9.1.7.Final
Show newest version
package org.infinispan.interceptors.distribution;

import static org.infinispan.commands.VisitableCommand.LoadType.OWNER;
import static org.infinispan.commands.VisitableCommand.LoadType.PRIMARY;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;

import org.infinispan.commands.CommandInvocationId;
import org.infinispan.commands.CommandsFactory;
import org.infinispan.commands.FlagAffectedCommand;
import org.infinispan.commands.TopologyAffectedCommand;
import org.infinispan.commands.VisitableCommand;
import org.infinispan.commands.read.GetCacheEntryCommand;
import org.infinispan.commands.read.GetKeyValueCommand;
import org.infinispan.commands.write.AbstractDataWriteCommand;
import org.infinispan.commands.write.BackupAckCommand;
import org.infinispan.commands.write.BackupMultiKeyAckCommand;
import org.infinispan.commands.write.BackupPutMapRcpCommand;
import org.infinispan.commands.write.BackupWriteRcpCommand;
import org.infinispan.commands.write.DataWriteCommand;
import org.infinispan.commands.write.PrimaryAckCommand;
import org.infinispan.commands.write.PrimaryMultiKeyAckCommand;
import org.infinispan.commands.write.PutKeyValueCommand;
import org.infinispan.commands.write.PutMapCommand;
import org.infinispan.commands.write.RemoveCommand;
import org.infinispan.commands.write.ReplaceCommand;
import org.infinispan.container.entries.CacheEntry;
import org.infinispan.context.InvocationContext;
import org.infinispan.context.impl.FlagBitSets;
import org.infinispan.distribution.DistributionInfo;
import org.infinispan.distribution.TriangleOrderManager;
import org.infinispan.distribution.ch.ConsistentHash;
import org.infinispan.factories.annotations.Inject;
import org.infinispan.factories.annotations.Start;
import org.infinispan.interceptors.BasicInvocationStage;
import org.infinispan.interceptors.TriangleAckInterceptor;
import org.infinispan.remoting.inboundhandler.DeliverOrder;
import org.infinispan.remoting.transport.Address;
import org.infinispan.statetransfer.OutdatedTopologyException;
import org.infinispan.statetransfer.StateTransferInterceptor;
import org.infinispan.topology.CacheTopology;
import org.infinispan.util.concurrent.CommandAckCollector;
import org.infinispan.util.concurrent.CompletableFutures;
import org.infinispan.util.logging.Log;
import org.infinispan.util.logging.LogFactory;

/**
 * Non-transactional interceptor used by distributed caches that supports concurrent writes.
 * 

* It is implemented based on the Triangle algorithm. *

* The {@link GetKeyValueCommand} reads the value locally if it is available (the node is an owner or the value is * stored in L1). If it isn't available, a remote request is made. * The {@link DataWriteCommand} is performed as follow: *

    *
  • The command if forwarded to the primary owner of the key.
  • *
  • The primary owner locks the key and executes the operation; sends the {@link BackupWriteRcpCommand} to the backup * owners; releases the lock; sends the {@link PrimaryAckCommand} back to the originator.
  • *
  • The backup owner applies the update and sends a {@link BackupAckCommand} back to the originator.
  • *
  • The originator collects the ack from all the owners and returns.
  • *
* The {@link PutMapCommand} is performed in a similar way: *
    *
  • The subset of the map is split by primary owner.
  • *
  • The primary owner locks the key and executes the command; splits the keys by backup owner and send them; * releases the locks and sends the {@link PrimaryMultiKeyAckCommand} back to the originator.
  • *
  • The backup owner applies the update and sends back the {@link BackupMultiKeyAckCommand} to the originator.
  • *
  • The originator collects all the acknowledges from all owners and returns.
  • *
* The acknowledges management is done in the {@link TriangleAckInterceptor} by the {@link CommandAckCollector}. *

* If a topology changes while a command is executed, an {@link OutdatedTopologyException} is thrown. The {@link * StateTransferInterceptor} will catch it and retries the command. *

* TODO: finish the wiki page and add a link to it! * * @author Pedro Ruivo * @since 9.0 */ public class TriangleDistributionInterceptor extends NonTxDistributionInterceptor { private static final Log log = LogFactory.getLog(TriangleDistributionInterceptor.class); private static final boolean trace = log.isTraceEnabled(); private CommandAckCollector commandAckCollector; private CommandsFactory commandsFactory; private TriangleOrderManager triangleOrderManager; private Address localAddress; @Inject public void inject(CommandAckCollector commandAckCollector, CommandsFactory commandsFactory, TriangleOrderManager triangleOrderManager) { this.commandAckCollector = commandAckCollector; this.commandsFactory = commandsFactory; this.triangleOrderManager = triangleOrderManager; } @Start public void start() { localAddress = rpcManager.getAddress(); } @Override public BasicInvocationStage visitPutKeyValueCommand(InvocationContext ctx, PutKeyValueCommand command) throws Throwable { return handleDataWriteCommand(ctx, command); } @Override public BasicInvocationStage visitPutMapCommand(InvocationContext ctx, PutMapCommand command) throws Throwable { if (ctx.isOriginLocal()) { return handleLocalPutMapCommand(ctx, command); } else { return handleRemotePutMapCommand(ctx, command); } } @Override public BasicInvocationStage visitRemoveCommand(InvocationContext ctx, RemoveCommand command) throws Throwable { return handleDataWriteCommand(ctx, command); } @Override public BasicInvocationStage visitReplaceCommand(InvocationContext ctx, ReplaceCommand command) throws Throwable { return handleDataWriteCommand(ctx, command); } private BasicInvocationStage handleRemotePutMapCommand(InvocationContext ctx, PutMapCommand command) { CacheTopology cacheTopology = checkTopologyId(command); final ConsistentHash ch = cacheTopology.getWriteConsistentHash(); final VisitableCommand.LoadType loadType = command.loadType(); if (command.isForwarded() || ch.getNumOwners() == 1) { //backup & remote || no backups return invokeNextAsync(ctx, command, checkRemoteGetIfNeeded(ctx, command, command.getMap().keySet(), ch, loadType == OWNER)); } //primary, we need to send the command to the backups ordered! sendToBackups(command, command.getMap(), ch); return invokeNextAsync(ctx, command, checkRemoteGetIfNeeded(ctx, command, command.getMap().keySet(), ch, loadType == OWNER)); } private void sendToBackups(PutMapCommand command, Map entries, ConsistentHash ch) { BackupOwnerClassifier filter = new BackupOwnerClassifier(ch); entries.entrySet().forEach(filter::add); int topologyId = command.getTopologyId(); for (Map.Entry> entry : filter.perSegmentKeyValue.entrySet()) { int segmentId = entry.getKey(); List

owners = ch.locateOwnersForSegment(segmentId); int size = owners.size(); if (size == 1) { //only the primary owner continue; //or break? can we have another segment with backups? } Map map = entry.getValue(); long sequence = triangleOrderManager.next(segmentId, topologyId); BackupPutMapRcpCommand backupPutMapRcpCommand = commandsFactory.buildBackupPutMapRcpCommand(command); backupPutMapRcpCommand.setMap(map); backupPutMapRcpCommand.setSequence(sequence); if (trace) { logCommandSequence(command.getCommandInvocationId(), segmentId, sequence); } rpcManager.sendToMany(owners.subList(1, size), backupPutMapRcpCommand, DeliverOrder.NONE); } } private BasicInvocationStage handleLocalPutMapCommand(InvocationContext ctx, PutMapCommand command) { //local command. we need to split by primary owner to send the command to them final CacheTopology cacheTopology = checkTopologyId(command); final ConsistentHash consistentHash = cacheTopology.getWriteConsistentHash(); final PrimaryOwnerClassifier filter = new PrimaryOwnerClassifier(consistentHash); final boolean sync = isSynchronous(command); final VisitableCommand.LoadType loadType = command.loadType(); command.getMap().entrySet().forEach(filter::add); if (sync) { commandAckCollector .createMultiKeyCollector(command.getCommandInvocationId(), filter.primaries.keySet(), filter.backups, command.getTopologyId()); final Map localEntries = filter.primaries.remove(localAddress); forwardToPrimaryOwners(command, filter); if (localEntries != null) { sendToBackups(command, localEntries, consistentHash); return invokeNextAsync(ctx, command, checkRemoteGetIfNeeded(ctx, command, localEntries.keySet(), consistentHash, loadType == PRIMARY || loadType == OWNER)).handle((rCtx, rCommand, rv, t) -> { PutMapCommand cmd = (PutMapCommand) rCommand; if (t != null) { commandAckCollector.completeExceptionally(cmd.getCommandInvocationId(), t, cmd.getTopologyId()); } else { //noinspection unchecked commandAckCollector.multiKeyPrimaryAck(cmd.getCommandInvocationId(), localAddress, (Map) rv, cmd.getTopologyId()); } }); } return invokeNext(ctx, command).exceptionally((rCtx, rCommand, t) -> { PutMapCommand cmd = (PutMapCommand) rCommand; commandAckCollector.completeExceptionally(cmd.getCommandInvocationId(), t, cmd.getTopologyId()); assert t != null; throw t; }); } final Map localEntries = filter.primaries.remove(localAddress); forwardToPrimaryOwners(command, filter); if (localEntries != null) { sendToBackups(command, localEntries, consistentHash); return invokeNextAsync(ctx, command, checkRemoteGetIfNeeded(ctx, command, localEntries.keySet(), consistentHash, loadType == PRIMARY || loadType == OWNER)); } return invokeNext(ctx, command); } private CompletableFuture checkRemoteGetIfNeeded( InvocationContext ctx, C command, Set keys, ConsistentHash consistentHash, boolean needsPreviousValue) { if (!needsPreviousValue) { for (Object key : keys) { CacheEntry cacheEntry = ctx.lookupEntry(key); if (cacheEntry == null && consistentHash.isKeyLocalToNode(localAddress, key)) { entryFactory.wrapExternalEntry(ctx, key, null, true); } } return CompletableFutures.completedNull(); } final List> futureList = new ArrayList<>(keys.size()); for (Object key : keys) { CacheEntry cacheEntry = ctx.lookupEntry(key); if (cacheEntry == null && consistentHash.isKeyLocalToNode(localAddress, key)) { wrapKeyExternally(ctx, command, key, futureList); } } final int size = futureList.size(); if (size == 0) { return CompletableFutures.completedNull(); } CompletableFuture[] array = new CompletableFuture[size]; futureList.toArray(array); return CompletableFuture.allOf(array); } private void wrapKeyExternally(InvocationContext ctx, C command, Object key, List> futureList) { if (command.hasAnyFlag(FlagBitSets.SKIP_REMOTE_LOOKUP | FlagBitSets.CACHE_MODE_LOCAL)) { entryFactory.wrapExternalEntry(ctx, key, null, true); } else { GetCacheEntryCommand fakeGetCommand = cf.buildGetCacheEntryCommand(key, command.getFlagsBitSet()); fakeGetCommand.setTopologyId(command.getTopologyId()); futureList.add(remoteGet(ctx, fakeGetCommand, key, true)); } } private void forwardToPrimaryOwners(PutMapCommand command, PrimaryOwnerClassifier splitter) { for (Map.Entry> entry : splitter.primaries.entrySet()) { PutMapCommand copy = new PutMapCommand(command, false); copy.setMap(entry.getValue()); rpcManager.sendTo(entry.getKey(), copy, DeliverOrder.NONE); } } // TODO: this should just override handleNonTxWriteCommand when functional commands will be triangelized private BasicInvocationStage handleDataWriteCommand(InvocationContext context, AbstractDataWriteCommand command) { assert !context.isInTxScope(); if (command.hasAnyFlag(FlagBitSets.CACHE_MODE_LOCAL)) { //don't go through the triangle return invokeNext(context, command); } final CacheTopology topology = checkTopologyId(command); DistributionInfo distributionInfo = new DistributionInfo(command.getKey(), topology.getWriteConsistentHash(), localAddress); switch (distributionInfo.ownership()) { case PRIMARY: assert context.lookupEntry(command.getKey()) != null; return primaryOwnerWrite(context, command, distributionInfo); case BACKUP: if (context.isOriginLocal()) { return localWriteInvocation(context, command, distributionInfo); } else { CacheEntry entry = context.lookupEntry(command.getKey()); if (entry == null) { if (command.loadType() == OWNER) { return invokeNextAsync(context, command, remoteGet(context, command, command.getKey(), true)); } entryFactory.wrapExternalEntry(context, command.getKey(), null, true); } return invokeNext(context, command); } case NON_OWNER: //always local! assert context.isOriginLocal(); return localWriteInvocation(context, command, distributionInfo); } throw new IllegalStateException(); } private BasicInvocationStage primaryOwnerWrite(InvocationContext context, DataWriteCommand command, final DistributionInfo distributionInfo) { //we are the primary owner. we need to execute the command, check if successful, send to backups and reply to originator is needed. if (command.hasAnyFlag(FlagBitSets.COMMAND_RETRY)) { command.setValueMatcher(command.getValueMatcher().matcherForRetry()); } return invokeNext(context, command).thenAccept((rCtx, rCommand, rv) -> { final DataWriteCommand dwCommand = (DataWriteCommand) rCommand; final CommandInvocationId id = dwCommand.getCommandInvocationId(); if (!dwCommand.isSuccessful()) { if (trace) { log.tracef("Command %s not successful in primary owner.", id); } return; } if (distributionInfo.owners().size() > 1) { Collection
backupOwners = distributionInfo.backups(); if (rCtx.isOriginLocal() && (isSynchronous(dwCommand) || dwCommand.isReturnValueExpected())) { commandAckCollector.create(id, rv, distributionInfo.owners(), dwCommand.getTopologyId()); //check the topology after registering the collector. //if we don't, the collector may wait forever (==timeout) for non-existing acknowledges. checkTopologyId(dwCommand); } if (trace) { log.tracef("Command %s send to backup owner %s.", dwCommand.getCommandInvocationId(), backupOwners); } long sequenceNumber = triangleOrderManager.next(distributionInfo.getSegmentId(), dwCommand.getTopologyId()); BackupWriteRcpCommand backupWriteRcpCommand = commandsFactory.buildBackupWriteRcpCommand(dwCommand); backupWriteRcpCommand.setSequence(sequenceNumber); if (trace) { logCommandSequence(id, distributionInfo.getSegmentId(), sequenceNumber); } // we must send the message only after the collector is registered in the map rpcManager.sendToMany(backupOwners, backupWriteRcpCommand, DeliverOrder.NONE); } }); } private void logCommandSequence(CommandInvocationId id, int segment, long sequence) { log.tracef("Command %s got sequence %s for segment %s", id, sequence, segment); } private BasicInvocationStage localWriteInvocation(InvocationContext context, DataWriteCommand command, DistributionInfo distributionInfo) { assert context.isOriginLocal(); final CommandInvocationId invocationId = command.getCommandInvocationId(); if ((isSynchronous(command) || command.isReturnValueExpected()) && !command.hasAnyFlag(FlagBitSets.PUT_FOR_EXTERNAL_READ)) { commandAckCollector.create(invocationId, distributionInfo.owners(), command.getTopologyId()); } if (command.hasAnyFlag(FlagBitSets.COMMAND_RETRY)) { command.setValueMatcher(command.getValueMatcher().matcherForRetry()); } rpcManager.sendTo(distributionInfo.primary(), command, DeliverOrder.NONE); return returnWith(null); } /** * Classifies the keys by primary owner (address => keys & segments) and backup owners (address => segments). *

* The first map is used to forward the command to the primary owner with the subset of keys. *

* The second map is used to initialize the {@link CommandAckCollector} to wait for the backups acknowledges. */ private static class PrimaryOwnerClassifier { private final Map> backups = new HashMap<>(); private final Map> primaries = new HashMap<>(); private final ConsistentHash consistentHash; private PrimaryOwnerClassifier(ConsistentHash consistentHash) { this.consistentHash = consistentHash; } public void add(Map.Entry entry) { final int segment = consistentHash.getSegment(entry.getKey()); final Iterator

iterator = consistentHash.locateOwnersForSegment(segment).iterator(); final Address primaryOwner = iterator.next(); primaries.computeIfAbsent(primaryOwner, address -> new HashMap<>()).put(entry.getKey(), entry.getValue()); while (iterator.hasNext()) { Address backup = iterator.next(); backups.computeIfAbsent(backup, address -> new HashSet<>()).add(segment); } } } /** * A classifier used in the primary owner when handles a remote {@link PutMapCommand}. *

* It maps the backup owner address to the subset of keys. */ private static class BackupOwnerClassifier { private final Map> perSegmentKeyValue = new HashMap<>(); private final ConsistentHash consistentHash; private BackupOwnerClassifier(ConsistentHash consistentHash) { this.consistentHash = consistentHash; } public void add(Map.Entry entry) { perSegmentKeyValue.computeIfAbsent(consistentHash.getSegment(entry.getKey()), address -> new HashMap<>()) .put(entry.getKey(), entry.getValue()); } } }