
com.hazelcast.internal.partition.impl.InternalPartitionServiceImpl Maven / Gradle / Ivy
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.partition.impl;
import com.hazelcast.cluster.Address;
import com.hazelcast.cluster.ClusterState;
import com.hazelcast.cluster.Member;
import com.hazelcast.core.HazelcastInstanceNotActiveException;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.instance.impl.Node;
import com.hazelcast.internal.cluster.ClusterStateListener;
import com.hazelcast.internal.cluster.impl.ClusterServiceImpl;
import com.hazelcast.internal.cluster.impl.operations.TriggerMemberListPublishOp;
import com.hazelcast.internal.metrics.MetricsRegistry;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.internal.partition.IPartition;
import com.hazelcast.internal.partition.IPartitionLostEvent;
import com.hazelcast.internal.partition.InternalPartition;
import com.hazelcast.internal.partition.InternalPartitionService;
import com.hazelcast.internal.partition.MigrationInfo;
import com.hazelcast.internal.partition.MigrationInfo.MigrationStatus;
import com.hazelcast.internal.partition.PartitionAwareService;
import com.hazelcast.internal.partition.PartitionEventListener;
import com.hazelcast.internal.partition.PartitionReplica;
import com.hazelcast.internal.partition.PartitionReplicaVersionManager;
import com.hazelcast.internal.partition.PartitionRuntimeState;
import com.hazelcast.internal.partition.PartitionServiceProxy;
import com.hazelcast.internal.partition.PartitionTableView;
import com.hazelcast.internal.partition.ReadonlyInternalPartition;
import com.hazelcast.internal.partition.operation.AssignPartitions;
import com.hazelcast.internal.partition.operation.DemoteRequestOperation;
import com.hazelcast.internal.partition.operation.FetchPartitionStateOperation;
import com.hazelcast.internal.partition.operation.PartitionStateCheckOperation;
import com.hazelcast.internal.partition.operation.PartitionStateOperation;
import com.hazelcast.internal.partition.operation.ShutdownRequestOperation;
import com.hazelcast.internal.serialization.Data;
import com.hazelcast.internal.util.ExceptionUtil;
import com.hazelcast.internal.util.HashUtil;
import com.hazelcast.internal.util.StringUtil;
import com.hazelcast.internal.util.collection.PartitionIdSet;
import com.hazelcast.internal.util.scheduler.CoalescingDelayedTrigger;
import com.hazelcast.internal.util.scheduler.ScheduledEntry;
import com.hazelcast.logging.ILogger;
import com.hazelcast.partition.MigrationListener;
import com.hazelcast.partition.NoDataMemberInClusterException;
import com.hazelcast.partition.PartitionEvent;
import com.hazelcast.partition.PartitionLostListener;
import com.hazelcast.spi.exception.TargetNotMemberException;
import com.hazelcast.spi.impl.NodeEngine;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.eventservice.EventPublishingService;
import com.hazelcast.spi.impl.executionservice.ExecutionService;
import com.hazelcast.spi.impl.operationexecutor.OperationExecutor;
import com.hazelcast.spi.impl.operationservice.OperationService;
import com.hazelcast.spi.impl.operationservice.UrgentSystemOperation;
import com.hazelcast.spi.impl.operationservice.impl.InvocationFuture;
import com.hazelcast.spi.impl.operationservice.impl.OperationServiceImpl;
import com.hazelcast.spi.properties.ClusterProperty;
import com.hazelcast.spi.properties.HazelcastProperties;
import javax.annotation.Nonnull;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executor;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import static com.hazelcast.cluster.memberselector.MemberSelectors.DATA_MEMBER_SELECTOR;
import static com.hazelcast.cluster.memberselector.MemberSelectors.NON_LOCAL_MEMBER_SELECTOR;
import static com.hazelcast.internal.metrics.MetricDescriptorConstants.PARTITIONS_METRIC_PARTITION_SERVICE_MAX_BACKUP_COUNT;
import static com.hazelcast.internal.metrics.MetricDescriptorConstants.PARTITIONS_METRIC_PARTITION_SERVICE_MIGRATION_QUEUE_SIZE;
import static com.hazelcast.internal.metrics.MetricDescriptorConstants.PARTITIONS_PREFIX;
import static com.hazelcast.internal.partition.PartitionStampUtil.calculateStamp;
import static com.hazelcast.internal.partition.impl.MigrationManagerImpl.applyMigration;
import static com.hazelcast.internal.util.MapUtil.createHashMap;
import static java.lang.Math.ceil;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Thread.currentThread;
import static java.util.concurrent.TimeUnit.SECONDS;
/**
* The {@link InternalPartitionService} implementation.
*/
@SuppressWarnings({"checkstyle:methodcount", "checkstyle:classfanoutcomplexity", "checkstyle:classdataabstractioncoupling"})
public class InternalPartitionServiceImpl implements InternalPartitionService,
EventPublishingService>,
PartitionAwareService, ClusterStateListener {
// Sleep between ownership polls in getPartitionOwnerOrWait and lock-acquisition timeout in firstArrangement (millis).
private static final int PARTITION_OWNERSHIP_WAIT_MILLIS = 10;
// NOTE(review): not referenced in the visible part of the file; presumably the polling step of the safe-shutdown wait - confirm.
private static final int SAFE_SHUTDOWN_MAX_AWAIT_STEP_MILLIS = 1000;
// Upper bound (millis) for the await step computed in onDemote.
private static final int DEMOTE_MAX_AWAIT_STEP_MILLIS = 5000;
// NOTE(review): not referenced in the visible part of the file; presumably a timeout for fetching partition state - confirm.
private static final long FETCH_PARTITION_STATE_SECONDS = 5;
// Delay (millis) handed to masterTrigger; twice this value is passed as the trigger's second timing argument.
private static final long TRIGGER_MASTER_DELAY_MILLIS = 1000;
private final Node node;
private final NodeEngineImpl nodeEngine;
private final ILogger logger;
// Number of partitions, read once from ClusterProperty.PARTITION_COUNT in the constructor.
private final int partitionCount;
// Read from ClusterProperty.PARTITION_MIGRATION_TIMEOUT (millis) in the constructor.
private final long partitionMigrationTimeout;
private final PartitionServiceProxy proxy;
// Lock taken around partition table reads/updates in this class; the same instance is handed to the migration manager.
private final Lock partitionServiceLock = new ReentrantLock();
private final PartitionStateManager partitionStateManager;
private final MigrationManager migrationManager;
private final PartitionReplicaManager replicaManager;
private final PartitionReplicaStateChecker partitionReplicaStateChecker;
private final PartitionEventManager partitionEventManager;
/** Determines if a {@link AssignPartitions} is being sent to the master, used to limit partition assignment requests. */
private final AtomicBoolean masterTriggered = new AtomicBoolean(false);
// Delayed trigger whose action resets masterTriggered (see constructor).
private final CoalescingDelayedTrigger masterTrigger;
// NOTE(review): raw AtomicReference; the referenced type is presumably CountDownLatch (see getDemoteLatch usage) - confirm.
private final AtomicReference shutdownLatchRef = new AtomicReference<>();
private final AtomicReference demoteLatchRef = new AtomicReference<>();
// Executor on which asynchronous invocation callbacks of this service are run.
private final Executor internalAsyncExecutor;
// Master address as last observed in memberAdded / memberRemoved.
private volatile Address latestMaster;
/** Whether the master should fetch the partition tables from other nodes, can happen when node becomes new master. */
private volatile boolean shouldFetchPartitionTables;
/**
 * Creates the partition service of the given node and wires up its collaborators
 * (state, migration, replica and event managers), the delayed master trigger and the
 * static metrics of this service.
 *
 * @param node the node this service belongs to
 */
public InternalPartitionServiceImpl(Node node) {
    final HazelcastProperties props = node.getProperties();
    this.partitionCount = props.getInteger(ClusterProperty.PARTITION_COUNT);
    this.node = node;
    this.nodeEngine = node.nodeEngine;
    this.logger = node.getLogger(InternalPartitionService.class);
    this.internalAsyncExecutor = this.nodeEngine.getExecutionService()
            .getExecutor(ExecutionService.ASYNC_EXECUTOR);

    // The collaborators below receive `this`; they are created only after the basic fields are assigned
    // and in the same order as before.
    this.partitionStateManager = new PartitionStateManagerImpl(node, this);
    this.migrationManager = new MigrationManagerImpl(node, this, this.partitionServiceLock);
    this.replicaManager = new PartitionReplicaManager(node, this);
    this.partitionReplicaStateChecker = new PartitionReplicaStateChecker(node.getNodeEngine(), this);
    this.partitionEventManager = new PartitionEventManager(node);

    final long secondTriggerDelayMillis = 2 * TRIGGER_MASTER_DELAY_MILLIS;
    this.masterTrigger = new CoalescingDelayedTrigger(this.nodeEngine.getExecutionService(),
            TRIGGER_MASTER_DELAY_MILLIS, secondTriggerDelayMillis, this::resetMasterTriggeredFlag);
    this.partitionMigrationTimeout = props.getMillis(ClusterProperty.PARTITION_MIGRATION_TIMEOUT);
    this.proxy = new PartitionServiceProxy(this.nodeEngine, this);

    final MetricsRegistry registry = this.nodeEngine.getMetricsRegistry();
    registry.registerStaticMetrics(this, PARTITIONS_PREFIX);
    registry.registerStaticMetrics(this.partitionStateManager, PARTITIONS_PREFIX);
    registry.registerStaticMetrics(this.migrationManager, PARTITIONS_PREFIX);
    registry.registerStaticMetrics(this.replicaManager, PARTITIONS_PREFIX);
}
/**
 * Schedules the periodic publication of the partition runtime state, starts the migration
 * manager and schedules the replica version sync.
 * The publication interval is read from {@link ClusterProperty#PARTITION_TABLE_SEND_INTERVAL}
 * (seconds); non-positive values are replaced by one second.
 *
 * @param nodeEngine the node engine providing the execution service
 * @param properties not used by this implementation
 */
@Override
public void init(NodeEngine nodeEngine, Properties properties) {
    final int configuredSeconds = node.getProperties().getSeconds(ClusterProperty.PARTITION_TABLE_SEND_INTERVAL);
    // anything <= 0 falls back to a one second interval
    final int sendIntervalSeconds = Math.max(configuredSeconds, 1);

    final ExecutionService scheduler = nodeEngine.getExecutionService();
    final PublishPartitionRuntimeStateTask publishTask = new PublishPartitionRuntimeStateTask(node, this);
    scheduler.scheduleWithRepetition(publishTask, sendIntervalSeconds, sendIntervalSeconds, SECONDS);

    migrationManager.start();
    replicaManager.scheduleReplicaVersionSync(scheduler);
}
/**
 * Returns the owner address of the given partition, or {@code null} when no owner is known yet.
 * <p>
 * If the partition table is not initialized, the first arrangement is attempted. If the partition
 * still has no owner replica, this node is not the master and the cluster has at least one data
 * member, the master is asked to assign the partitions.
 *
 * @param partitionId the partition ID
 * @return the owner address or {@code null}
 */
@Override
public Address getPartitionOwner(int partitionId) {
    if (!partitionStateManager.isInitialized()) {
        firstArrangement();
    }

    final InternalPartition partition = partitionStateManager.getPartitionImpl(partitionId);
    final boolean ownerUnknownOnNonMaster = partition.getOwnerReplicaOrNull() == null && !node.isMaster();
    if (ownerUnknownOnNonMaster && !isClusterFormedByOnlyLiteMembers()) {
        triggerMasterToAssignPartitions();
    }
    return partition.getOwnerOrNull();
}
/**
 * Polls {@link #getPartitionOwner(int)} until an owner is available, sleeping
 * {@code PARTITION_OWNERSHIP_WAIT_MILLIS} between attempts.
 *
 * @param partitionId the partition ID
 * @return the owner address, never {@code null}
 * @throws HazelcastInstanceNotActiveException if the node engine is not running
 * @throws IllegalStateException               if the cluster state does not allow migrations
 * @throws NoDataMemberInClusterException      if the cluster has no data member
 */
@Override
public Address getPartitionOwnerOrWait(int partitionId) {
    for (;;) {
        final Address owner = getPartitionOwner(partitionId);
        if (owner != null) {
            return owner;
        }

        if (!nodeEngine.isRunning()) {
            throw new HazelcastInstanceNotActiveException();
        }

        final ClusterState clusterState = node.getClusterService().getClusterState();
        if (!clusterState.isMigrationAllowed()) {
            throw new IllegalStateException("Partitions can't be assigned since cluster-state: " + clusterState);
        }

        if (isClusterFormedByOnlyLiteMembers()) {
            throw new NoDataMemberInClusterException(
                    "Partitions can't be assigned since all nodes in the cluster are lite members");
        }

        try {
            Thread.sleep(PARTITION_OWNERSHIP_WAIT_MILLIS);
        } catch (InterruptedException e) {
            // restore the interrupt flag before propagating
            currentThread().interrupt();
            throw ExceptionUtil.rethrow(e);
        }
    }
}
/**
 * Performs the initial partition assignment when called on the master.
 * <p>
 * On a non-master member the master is asked to assign the partitions and {@code null} is returned.
 * On the master, the partition service lock is acquired with a timeout of
 * {@code PARTITION_OWNERSHIP_WAIT_MILLIS}; if the lock cannot be obtained, or the thread is
 * interrupted while waiting, {@code null} is returned.
 *
 * @return the current partition runtime state, or {@code null} as described above or when the
 *         partition table is still not initialized
 */
@Override
public PartitionRuntimeState firstArrangement() {
    if (!isLocalMemberMaster()) {
        triggerMasterToAssignPartitions();
        return null;
    }

    final boolean lockAcquired;
    try {
        lockAcquired = partitionServiceLock.tryLock(PARTITION_OWNERSHIP_WAIT_MILLIS, TimeUnit.MILLISECONDS);
    } catch (InterruptedException e) {
        currentThread().interrupt();
        return null;
    }
    if (!lockAcquired) {
        return null;
    }

    try {
        // members that requested to disown their data are excluded from the initial assignment
        if (!partitionStateManager.isInitialized()
                && partitionStateManager.initializePartitionAssignments(
                        migrationManager.getDataDisownRequestedMembers())) {
            publishPartitionRuntimeState();
        }
        return createPartitionStateInternal();
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Reports whether the partition state manager considers the partition table initialized.
 *
 * @return the value of {@code partitionStateManager.isInitialized()}
 */
@Override
public boolean isPartitionAssignmentDone() {
return partitionStateManager.isInitialized();
}
/**
 * Sends a {@link AssignPartitions} to the master to assign partitions.
 * <p>
 * Nothing is sent when {@link #shouldTriggerMasterToAssignPartitions()} is {@code false}, when the
 * cluster state does not allow migrations, when the master is unknown or is this member, or when
 * a previous request is still flagged as in flight via {@code masterTriggered}.
 * When the invocation completes the flag is reset; a non-null response is applied as the new
 * partition runtime state with the contacted master set as its sender.
 */
private void triggerMasterToAssignPartitions() {
    if (!shouldTriggerMasterToAssignPartitions()) {
        return;
    }

    ClusterState clusterState = node.getClusterService().getClusterState();
    if (!clusterState.isMigrationAllowed()) {
        logger.warning("Partitions can't be assigned since cluster-state=" + clusterState);
        return;
    }

    final Address masterAddress = latestMaster;
    if (masterAddress == null || masterAddress.equals(node.getThisAddress())) {
        return;
    }

    // only one assignment request may be in flight at a time
    if (!masterTriggered.compareAndSet(false, true)) {
        return;
    }

    OperationServiceImpl operationService = nodeEngine.getOperationService();
    InvocationFuture<PartitionRuntimeState> future =
            operationService.invokeOnTarget(SERVICE_NAME, new AssignPartitions(), masterAddress);
    future.whenCompleteAsync((partitionState, throwable) -> {
        // the flag is released on success and on failure alike
        resetMasterTriggeredFlag();
        if (throwable != null) {
            logger.severe(throwable);
            return;
        }
        if (partitionState != null) {
            partitionState.setMaster(masterAddress);
            processPartitionRuntimeState(partitionState);
        }
    }, internalAsyncExecutor);
    masterTrigger.executeWithDelay();
}
/**
 * Tells whether it makes sense to ask the master for a partition assignment: the partition table
 * must not be initialized yet, this member must have joined the cluster and its start must be completed.
 *
 * @return {@code true} if all three conditions hold
 */
private boolean shouldTriggerMasterToAssignPartitions() {
    final ClusterServiceImpl clusterService = node.getClusterService();
    if (partitionStateManager.isInitialized()) {
        return false;
    }
    if (!clusterService.isJoined()) {
        return false;
    }
    return node.getNodeExtension().isStartCompleted();
}
/**
 * Clears the {@code masterTriggered} flag so that a new {@link AssignPartitions} request may be sent.
 * Also used as the action of {@code masterTrigger}.
 */
private void resetMasterTriggeredFlag() {
masterTriggered.set(false);
}
/**
 * Checks whether the cluster currently has no member matching {@code DATA_MEMBER_SELECTOR}.
 *
 * @return {@code true} if the set of data members is empty
 */
private boolean isClusterFormedByOnlyLiteMembers() {
    return node.getClusterService()
            .getMembers(DATA_MEMBER_SELECTOR)
            .isEmpty();
}
/**
 * Installs the initial partition table and state version by delegating to
 * {@link PartitionStateManager#setInitialState(PartitionTableView)} while holding the partition service lock.
 * If any partition has a replica, the partition state manager becomes initialized; otherwise
 * {@link PartitionStateManager#isInitialized()} stays {@code false} but the current state is updated nevertheless.
 *
 * @param partitionTable the initial partition table
 * @throws IllegalStateException if the partition manager has already been initialized
 */
public void setInitialState(PartitionTableView partitionTable) {
    final Lock serviceLock = partitionServiceLock;
    serviceLock.lock();
    try {
        partitionStateManager.setInitialState(partitionTable);
    } finally {
        serviceLock.unlock();
    }
}
/**
 * Returns the member group count as reported by the partition state manager.
 *
 * @return the value of {@code partitionStateManager.getMemberGroupsSize()}
 */
@Override
public int getMemberGroupsSize() {
return partitionStateManager.getMemberGroupsSize();
}
/**
 * Computes the maximum backup count currently allowed: one less than the number of member groups,
 * capped at {@link InternalPartition#MAX_BACKUP_COUNT} and never negative.
 *
 * @return the maximum allowed backup count
 */
@Probe(name = PARTITIONS_METRIC_PARTITION_SERVICE_MAX_BACKUP_COUNT)
@Override
public int getMaxAllowedBackupCount() {
    final int groupsBesidesOwner = getMemberGroupsSize() - 1;
    final int cappedByLimit = min(groupsBesidesOwner, InternalPartition.MAX_BACKUP_COUNT);
    return max(cappedByLimit, 0);
}
/**
 * Asks the partition state manager to update its member groups size.
 */
public void updateMemberGroupSize() {
partitionStateManager.updateMemberGroupsSize();
}
/**
 * Reacts to a member joining the cluster. Under the partition service lock it refreshes the
 * remembered master address, updates the member groups size for non-local members and, when this
 * member is the master and the partition table is initialized, triggers the migration control task.
 *
 * @param member the member that was added
 */
@Override
public void memberAdded(Member member) {
    logger.fine("Adding " + member);
    partitionServiceLock.lock();
    try {
        latestMaster = node.getClusterService().getMasterAddress();

        final boolean remoteMember = !member.localMember();
        if (remoteMember) {
            partitionStateManager.updateMemberGroupsSize();
        }

        if (isLocalMemberMaster() && partitionStateManager.isInitialized()) {
            migrationManager.triggerControlTask();
        }
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Reacts to one or more members leaving the cluster. Does nothing for an empty array.
 * <p>
 * Under the partition service lock, for every removed member the migration manager is notified,
 * replica sync requests to that member are cancelled and the remembered master address is refreshed.
 * When the cluster state allows migration or partition promotion, the member groups size is updated,
 * {@code shouldFetchPartitionTables} is raised if this node has just become the master, and the
 * partition table snapshot for the member is either stored or removed.
 * After all members are processed, the master triggers the (delayed) migration control task.
 *
 * @param members the removed members
 */
@Override
public void memberRemoved(Member... members) {
if (members.length == 0) {
return;
}
logger.fine("Removing " + Arrays.toString(members));
partitionServiceLock.lock();
try {
// the cluster state is read once and used for all removed members
ClusterState clusterState = node.getClusterService().getClusterState();
for (Member member : members) {
migrationManager.onMemberRemove(member);
replicaManager.cancelReplicaSyncRequestsTo(member);
// remember the previous master before refreshing it, to detect that this node took over mastership
Address formerMaster = latestMaster;
latestMaster = node.getClusterService().getMasterAddress();
if (clusterState.isMigrationAllowed() || clusterState.isPartitionPromotionAllowed()) {
partitionStateManager.updateMemberGroupsSize();
boolean isThisNodeNewMaster = node.isMaster() && !node.getThisAddress().equals(formerMaster);
if (isThisNodeNewMaster) {
assert !shouldFetchPartitionTables;
shouldFetchPartitionTables = true;
}
// keep partition table snapshot as member leaves, unless
// no partitions were assigned to it (member left with graceful shutdown)
if (!partitionStateManager.isAbsentInPartitionTable(member)) {
partitionStateManager.storeSnapshot(member.getUuid());
} else {
// member is removed due to graceful shutdown
// cleanup any leftover snapshots
partitionStateManager.removeSnapshot(member.getUuid());
}
}
}
// the following call should be made outside the loop, otherwise if node.isMaster() == true
// current node might store partition assignment snapshot for some members but not the others,
// since migrationManager.triggerControlTaskWithDelay() might remove unknown members from
// the partition table before the above logic is executed for another removed member.
if (node.isMaster()
&& (clusterState.isMigrationAllowed() || clusterState.isPartitionPromotionAllowed())) {
migrationManager.triggerControlTaskWithDelay();
}
} finally {
partitionServiceLock.unlock();
}
}
/**
 * Reacts to a cluster state change. When the new state allows migration, the partition table is
 * initialized and this member is the master, the migration control task is triggered if the
 * migration manager reports that repartitioning should be triggered.
 *
 * @param newState the new cluster state
 */
@Override
public void onClusterStateChange(ClusterState newState) {
    final boolean nothingToDo = !newState.isMigrationAllowed()
            || !partitionStateManager.isInitialized()
            || !isLocalMemberMaster();
    if (nothingToDo) {
        return;
    }

    partitionServiceLock.lock();
    try {
        // re-check initialization now that the lock is held
        final boolean repartition = partitionStateManager.isInitialized()
                && migrationManager.shouldTriggerRepartitioningWhenClusterStateAllowsMigration();
        if (repartition) {
            migrationManager.triggerControlTask();
        }
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Creates a copy of the partition table unless fetching the most recent partition table is still required.
 *
 * @return the partition runtime state, or {@code null} if a fetch of the most recent partition table
 *         is required or the partition table is not initialized
 */
@Override
public PartitionRuntimeState createPartitionState() {
    if (isFetchMostRecentPartitionTableTaskRequired()) {
        return null;
    }
    return createPartitionStateInternal();
}
/**
 * Returns a copy of the partition table or {@code null} if not initialized. This method will acquire the partition service
 * lock.
 * <p>
 * The returned state carries the completed migrations, the partition stamp and a copy of the
 * active migrations.
 *
 * @return the current partition runtime state or {@code null}
 */
public PartitionRuntimeState createPartitionStateInternal() {
    partitionServiceLock.lock();
    try {
        if (!partitionStateManager.isInitialized()) {
            return null;
        }

        List<MigrationInfo> completedMigrations = migrationManager.getCompletedMigrationsCopy();
        InternalPartition[] partitions = partitionStateManager.getPartitions();
        long stamp = partitionStateManager.getStamp();
        assert calculateStamp(partitions) == stamp : "Invalid partition stamp! Expected: "
                + calculateStamp(partitions) + ", Actual: " + stamp;

        PartitionRuntimeState state = new PartitionRuntimeState(partitions, completedMigrations, stamp);
        // Create copy of active migrations set.
        // Because original set can be updated concurrently behind the scenes.
        Collection<MigrationInfo> activeMigrations = new ArrayList<>(migrationManager.getActiveMigrations());
        state.setActiveMigrations(activeMigrations);
        return state;
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Creates a transient {@link PartitionRuntimeState} to commit promotions by applying the {@code migrationInfos}
 * to a copy of the partition table. Every given migration is marked {@link MigrationStatus#SUCCESS} and the
 * stamp of the returned state is calculated from the modified copy.
 * This method will acquire the partition service lock.
 *
 * @param migrationInfos the promotions to be executed on the destination
 * @return the partition table with the executed migrations or {@code null} if the partitions are not initialized (assigned)
 */
PartitionRuntimeState createPromotionCommitPartitionState(Collection<MigrationInfo> migrationInfos) {
    partitionServiceLock.lock();
    try {
        if (!partitionStateManager.isInitialized()) {
            return null;
        }

        List<MigrationInfo> completedMigrations = migrationManager.getCompletedMigrationsCopy();
        // work on a copy so the live partition table is left untouched
        InternalPartition[] partitions = partitionStateManager.getPartitionsCopy(false);
        for (MigrationInfo migrationInfo : migrationInfos) {
            InternalPartitionImpl partition = (InternalPartitionImpl) partitions[migrationInfo.getPartitionId()];
            applyMigration(partition, migrationInfo);
            migrationInfo.setStatus(MigrationStatus.SUCCESS);
        }

        long stamp = calculateStamp(partitions);
        return new PartitionRuntimeState(partitions, completedMigrations, stamp);
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Called on the master node to publish the current partition state to all cluster nodes. It will not publish the partition
 * state if the partitions have not yet been initialized, this member is not the master or migration tasks are
 * currently not allowed.
 * <p>
 * Sending is best effort: a failure to send to one member is logged at finest level and does not
 * prevent sending to the remaining members.
 */
@SuppressWarnings("checkstyle:npathcomplexity")
void publishPartitionRuntimeState() {
    if (!partitionStateManager.isInitialized()) {
        // do not send partition state until initialized!
        return;
    }
    if (!isLocalMemberMaster()) {
        return;
    }
    if (!areMigrationTasksAllowed()) {
        // migration is disabled because of a member leave, wait till enabled!
        return;
    }

    PartitionRuntimeState partitionState = createPartitionStateInternal();
    if (partitionState == null) {
        return;
    }
    if (logger.isFineEnabled()) {
        logger.fine("Publishing partition state, stamp: " + partitionState.getStamp());
    }

    PartitionStateOperation op = new PartitionStateOperation(partitionState, false);
    OperationService operationService = nodeEngine.getOperationService();
    Collection<Member> members = node.clusterService.getMembers();
    for (Member member : members) {
        if (member.localMember()) {
            continue;
        }
        try {
            operationService.send(op, member.getAddress());
        } catch (Exception e) {
            // deliberate best effort, keep publishing to the other members
            logger.finest(e);
        }
    }
}
/**
 * Sends the current partition state to a single member by invoking a {@link PartitionStateOperation} on it.
 *
 * @param target the address of the member to send the partition state to
 * @return the future of the invocation, or a future already completed with {@code false} if this
 *         member is not the master
 */
CompletableFuture<Boolean> sendPartitionRuntimeState(Address target) {
    if (!isLocalMemberMaster()) {
        return CompletableFuture.completedFuture(false);
    }

    assert partitionStateManager.isInitialized();
    PartitionRuntimeState partitionState = createPartitionStateInternal();
    assert partitionState != null;

    if (logger.isFineEnabled()) {
        logger.fine("Sending partition state, stamp: " + partitionState.getStamp() + ", to " + target);
    }

    OperationService operationService = nodeEngine.getOperationService();
    PartitionStateOperation op = new PartitionStateOperation(partitionState, true);
    return operationService.invokeOnTarget(SERVICE_NAME, op, target);
}
List> checkClusterPartitionRuntimeStates() {
if (!partitionStateManager.isInitialized()) {
return Collections.emptyList();
}
if (!isLocalMemberMaster()) {
return Collections.emptyList();
}
if (!areMigrationTasksAllowed()) {
// migration is disabled because of a member leave, wait till enabled!
return Collections.emptyList();
}
long stamp = getPartitionStateStamp();
if (logger.isFineEnabled()) {
logger.fine("Checking partition state, stamp: " + stamp);
}
List> futures = new ArrayList<>();
OperationService operationService = nodeEngine.getOperationService();
Collection members = node.clusterService.getMembers();
for (Member member : members) {
if (!member.localMember()) {
PartitionStateCheckOperation op = new PartitionStateCheckOperation(stamp);
InvocationFuture future = operationService.invokeOnTarget(SERVICE_NAME, op, member.getAddress());
futures.add(future.handleAsync((response, throwable) -> {
if (throwable == null) {
if (!Boolean.TRUE.equals(response)) {
logger.fine(member + " has a stale partition state. Will send the most recent "
+ "partition state now.");
return sendPartitionRuntimeState(member.getAddress());
}
return CompletableFuture.completedFuture(true);
} else {
logger.fine("Failure while checking partition state on " + member, throwable);
return sendPartitionRuntimeState(member.getAddress());
}
}, internalAsyncExecutor)
.thenComposeAsync(x -> x, internalAsyncExecutor));
}
}
return futures;
}
/**
 * Applies the {@code partitionState} if the start of this node is completed; otherwise the state
 * is ignored with a warning. The actual application, including validation that the sender is the
 * master known by this node, is done by {@link #applyPartitionRuntimeState(PartitionRuntimeState)}.
 *
 * @param partitionState the new partition state
 * @return {@code true} if the partition state was applied
 */
public boolean processPartitionRuntimeState(PartitionRuntimeState partitionState) {
    final boolean startCompleted = node.getNodeExtension().isStartCompleted();
    if (startCompleted) {
        return applyPartitionRuntimeState(partitionState);
    }

    logger.warning("Ignoring received partition table, startup is not completed yet. Sender: "
            + partitionState.getMaster());
    return false;
}
/**
 * Applies the given partition state after validating that its sender is the master.
 *
 * @param partitionState the partition state to apply
 * @return {@code false} if the sender is not accepted as master, otherwise the result of applying
 *         the new partition table
 */
public boolean applyPartitionRuntimeState(PartitionRuntimeState partitionState) {
    final Address sender = partitionState.getMaster();
    if (validateSenderIsMaster(sender, "partition table update")) {
        assert calculateStamp(partitionState.getPartitions()) == partitionState.getStamp()
                : "Invalid partition stamp! Expected: " + calculateStamp(partitionState.getPartitions())
                + ", Actual: " + partitionState.getStamp();

        return applyNewPartitionTable(partitionState.getPartitions(), partitionState.getCompletedMigrations(), sender);
    }
    return false;
}
/**
 * Checks that a message was sent by the master. Rejections are logged.
 *
 * @param sender      the address of the sender
 * @param messageType a description of the message, used in the log output only
 * @return {@code true} if the sender is accepted as the master; {@code false} if this member is
 *         the last known master and the sender is another member, or if the sender is not the master
 */
private boolean validateSenderIsMaster(Address sender, String messageType) {
    final Address localAddress = node.getThisAddress();

    if (localAddress.equals(latestMaster) && !localAddress.equals(sender)) {
        logger.warning("This is the master node and received " + messageType + " from " + sender
                + ". Ignoring incoming state! ");
        return false;
    }

    if (isMemberMaster(sender)) {
        return true;
    }

    final boolean senderUnknown = node.clusterService.getMember(sender) == null;
    if (senderUnknown) {
        logger.severe("Received " + messageType + " from an unknown member!"
                + " => Sender: " + sender + "! ");
    } else {
        logger.warning("Received " + messageType + ", but its sender doesn't seem to be master!"
                + " => Sender: " + sender + "! (Ignore if master node has changed recently.)");
    }
    return false;
}
/**
 * Applies the partition table sent by the {@code sender} if it is newer than the current one
 * and finalizes the migrations. Before applying, a member list update is requested from the master
 * if the table references unknown members.
 * This method does not validate the sender. It is caller method's responsibility.
 * This method will acquire the partition service lock.
 *
 * @param partitions          the new partition table
 * @param completedMigrations latest completed migrations
 * @param sender              the sender of the new partition state
 * @return {@code true} if the partition state version is higher than the current one and was applied or
 * if the partition state version is same as the current one
 */
private boolean applyNewPartitionTable(InternalPartition[] partitions, Collection<MigrationInfo> completedMigrations,
        Address sender) {
    partitionServiceLock.lock();
    try {
        requestMemberListUpdateIfUnknownMembersFound(sender, partitions);
        return updatePartitionsAndFinalizeMigrations(partitions, completedMigrations, sender);
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Scans the given partition table for replicas that are not known members of the cluster. If any
 * are found, they are logged at warning level and, provided the master is known and is not this
 * member, a {@link TriggerMemberListPublishOp} is sent to the master.
 * <p>
 * A replica counts as unknown when the cluster service has no member for its address and UUID and
 * either the cluster state allows joins or the replica is not registered as a missing member.
 *
 * @param sender     the sender of the partition table, used in the log output only
 * @param partitions the received partition table
 */
private void requestMemberListUpdateIfUnknownMembersFound(Address sender, InternalPartition[] partitions) {
    ClusterServiceImpl clusterService = node.clusterService;
    ClusterState clusterState = clusterService.getClusterState();

    Set<PartitionReplica> unknownReplicas = new HashSet<>();
    for (InternalPartition partition : partitions) {
        for (int index = 0; index < InternalPartition.MAX_REPLICA_COUNT; index++) {
            PartitionReplica replica = partition.getReplica(index);
            if (replica == null) {
                continue;
            }
            if (node.clusterService.getMember(replica.address(), replica.uuid()) == null
                    && (clusterState.isJoinAllowed()
                    || !clusterService.isMissingMember(replica.address(), replica.uuid()))) {
                unknownReplicas.add(replica);
            }
        }
    }

    if (unknownReplicas.isEmpty()) {
        return;
    }

    if (logger.isWarningEnabled()) {
        StringBuilder s = new StringBuilder("Following unknown addresses are found in partition table")
                .append(" sent from master[").append(sender).append("].")
                .append(" (Probably they have recently joined or left the cluster.)")
                .append(" {");
        for (PartitionReplica replica : unknownReplicas) {
            s.append("\n\t").append(replica);
        }
        s.append("\n}");
        logger.warning(s.toString());
    }

    Address masterAddress = node.getClusterService().getMasterAddress();
    // If node is shutting down, master can be null.
    if (masterAddress != null && !masterAddress.equals(node.getThisAddress())) {
        // unknown addresses found in partition table, request a new member-list from master
        nodeEngine.getOperationService().send(new TriggerMemberListPublishOp(), masterAddress);
    }
}
/**
 * Updates all partitions and versions, updates (adds and retains) the completed migrations and finalizes the active
 * migration if it is equal to any completed.
 * <p>
 * Each partition is handled individually by comparing versions: an older received version is skipped,
 * an equal version is accepted without change (and must be equal in content), a newer version replaces
 * the local replicas and version.
 *
 * @param partitions          new partition table
 * @param completedMigrations completed migrations
 * @param sender              the sender of the new partition state
 * @return whether or not the new partition table is accepted
 * @throws IllegalStateException if a partition has the same version locally and in the received table
 *                               but the two are not equal
 * @see MigrationManager#scheduleActiveMigrationFinalization(MigrationInfo)
 */
@SuppressWarnings({"checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity"})
private boolean updatePartitionsAndFinalizeMigrations(InternalPartition[] partitions,
        Collection<MigrationInfo> completedMigrations, Address sender) {
    boolean applied = false;
    boolean accepted = false;
    PartitionIdSet changedOwnerPartitions = new PartitionIdSet(partitionCount);

    for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
        InternalPartition newPartition = partitions[partitionId];
        InternalPartitionImpl currentPartition = partitionStateManager.getPartitionImpl(partitionId);
        int currentVersion = currentPartition.version();
        int newVersion = newPartition.version();

        if (newVersion < currentVersion) {
            // the local partition is already ahead of the received one
            if (logger.isFinestEnabled()) {
                logger.finest("Already applied partition update. partitionId=" + partitionId
                        + ", local version: " + currentVersion
                        + ", master version: " + newVersion + ", master: " + sender);
            }
            continue;
        }

        if (newVersion == currentVersion) {
            if (!currentPartition.equals(newPartition)) {
                throw new IllegalStateException("Partition updates are diverged! Local: " + currentPartition
                        + ", Received: " + newPartition);
            }
            if (logger.isFinestEnabled()) {
                logger.finest("Already applied partition update. partitionId=" + partitionId
                        + ", version: " + currentVersion + ", master: " + sender);
            }
            accepted = true;
            continue;
        }

        applied = true;
        accepted = true;
        if (currentPartition.setReplicasAndVersion(newPartition)) {
            changedOwnerPartitions.add(partitionId);
        }
    }

    for (MigrationInfo migration : completedMigrations) {
        boolean added = migrationManager.addCompletedMigration(migration);
        if (added) {
            migrationManager.scheduleActiveMigrationFinalization(migration);
        }
    }

    // Because partition versions are explicitly set to master's versions
    // while applying the partition table updates, we need to
    // (1) cancel any ongoing replica syncs (for partitions whose owners changed) and
    // (2) update the partition state stamp.
    partitionStateManager.partitionOwnersChanged(changedOwnerPartitions);

    if (logger.isFineEnabled()) {
        String prefix = applied
                ? "Applied partition state update with stamp: "
                : "Already applied partition state update with stamp: ";
        logger.fine(prefix + calculateStamp(partitions)
                + ", Local stamp is: " + partitionStateManager.getStamp());
    }

    migrationManager.retainCompletedMigrations(completedMigrations);

    // NOTE(review): the node extension is notified only when setInitialized() returns false,
    // presumably meaning the table was already initialized before this update - confirm.
    if (!partitionStateManager.setInitialized()) {
        node.getNodeExtension().onPartitionStateChange();
    }
    return accepted;
}
/**
 * Applies migrations that were completed on the master to the local partition table.
 * <p>
 * The sender must be the master. Under the partition service lock, migrations whose final partition
 * version is not newer than the local version are skipped; a migration whose initial partition version
 * does not match the local version stops the processing. A successful migration is applied to the
 * partition, a failed one only increments the partition version. Each processed migration is recorded
 * as completed and its finalization is scheduled.
 *
 * @param migrations the completed migrations
 * @param sender     the sender of the completed migrations
 * @return {@code false} if the sender is not accepted as master, the partition table is not initialized
 *         or a migration could not be applied because of a version mismatch; {@code true} if this
 *         member is the master or all migrations were applied or already applied
 */
@SuppressWarnings({"checkstyle:cyclomaticcomplexity", "checkstyle:npathcomplexity"})
public boolean applyCompletedMigrations(Collection<MigrationInfo> migrations, Address sender) {
    if (!validateSenderIsMaster(sender, "completed migrations")) {
        return false;
    }

    partitionServiceLock.lock();
    try {
        if (!partitionStateManager.isInitialized()) {
            if (logger.isFineEnabled()) {
                logger.fine("Cannot apply completed migrations until partition table is initialized. "
                        + "Completed migrations: " + migrations);
            }
            return false;
        }

        if (isLocalMemberMaster()) {
            return true;
        }

        boolean appliedAllMigrations = true;
        for (MigrationInfo migration : migrations) {
            InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(migration.getPartitionId());
            int currentVersion = partition.version();

            if (migration.getFinalPartitionVersion() <= currentVersion) {
                if (logger.isFinestEnabled()) {
                    logger.finest("Already applied " + migration + ". Local version: " + currentVersion
                            + ", Commit version: " + migration.getFinalPartitionVersion() + " Master: " + sender);
                }
                continue;
            }

            if (migration.getInitialPartitionVersion() != currentVersion) {
                // the local table is not at the version this migration starts from; stop here
                logger.warning("Cannot apply migration commit: " + migration + ". Expected version: "
                        + migration.getInitialPartitionVersion() + ", current version: " + currentVersion
                        + ", final version: " + migration.getFinalPartitionVersion() + ", Master: " + sender);
                appliedAllMigrations = false;
                break;
            }

            boolean added = migrationManager.addCompletedMigration(migration);
            assert added : "Migration: " + migration;

            if (migration.getStatus() == MigrationStatus.SUCCESS) {
                if (logger.isFineEnabled()) {
                    logger.fine("Applying completed migration " + migration);
                }
                applyMigration(partition, migration);
            } else {
                // migration failed...
                int increment = migration.getPartitionVersionIncrement();
                partitionStateManager.incrementPartitionVersion(partition.getPartitionId(), increment);
            }
            migrationManager.scheduleActiveMigrationFinalization(migration);
        }

        if (logger.isFineEnabled()) {
            logger.fine("Applied completed migrations with partition state stamp: " + partitionStateManager.getStamp());
        }
        migrationManager.retainCompletedMigrations(migrations);
        node.getNodeExtension().onPartitionStateChange();
        return appliedAllMigrations;
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Copies the partitions held by {@code partitionStateManager} into a newly allocated
 * {@code IPartition[]} of length {@code partitionCount}.
 */
@Override
public IPartition[] getPartitions() {
    final IPartition[] snapshot = new IPartition[this.partitionCount];
    final InternalPartition[] current = this.partitionStateManager.getPartitions();
    System.arraycopy(current, 0, snapshot, 0, snapshot.length);
    return snapshot;
}
/** Returns whatever {@code partitionStateManager.getPartitions()} returns, without copying it here. */
@Override
public InternalPartition[] getInternalPartitions() {
    return this.partitionStateManager.getPartitions();
}
/** Same as {@link #getPartition(int, boolean)} with owner assignment triggering enabled. */
@Override
public InternalPartition getPartition(int partitionId) {
    final boolean triggerOwnerAssignment = true;
    return getPartition(partitionId, triggerOwnerAssignment);
}
/**
 * Looks up the partition with the given id.
 *
 * @param partitionId            id of the partition
 * @param triggerOwnerAssignment when {@code true} and the partition has no owner replica,
 *                               {@code getPartitionOwner(partitionId)} is invoked before returning
 * @return the partition instance held by {@code partitionStateManager}
 */
@Override
public InternalPartition getPartition(int partitionId, boolean triggerOwnerAssignment) {
    final InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(partitionId);
    if (!triggerOwnerAssignment) {
        return partition;
    }
    if (partition.getOwnerReplicaOrNull() == null) {
        // probably ownerships are not set yet.
        // force it.
        getPartitionOwner(partitionId);
    }
    return partition;
}
/**
 * Requests demotion of the local member from the master and waits until the demote latch is released.
 * <p>
 * The request is repeated in steps of at most {@code DEMOTE_MAX_AWAIT_STEP_MILLIS} while the
 * timeout budget is not exhausted and either the request timed out or the latch was not released
 * within the step.
 *
 * @param timeout maximum time to wait
 * @param unit    unit of {@code timeout}
 * @return {@code true} if the local member is a lite member or the demote latch was released in time;
 *         {@code false} if the master is unknown, the request was rejected or failed, the timeout
 *         elapsed, or the calling thread was interrupted
 */
@Override
public boolean onDemote(long timeout, TimeUnit unit) {
    if (node.isLiteMember()) {
        return true;
    }
    CountDownLatch latch = getDemoteLatch();
    OperationServiceImpl operationService = nodeEngine.getOperationService();
    long timeoutMillis = unit.toMillis(timeout);
    long awaitStep = Math.min(DEMOTE_MAX_AWAIT_STEP_MILLIS, timeoutMillis);
    try {
        boolean shouldRetry;
        do {
            Address masterAddress = nodeEngine.getMasterAddress();
            if (masterAddress == null) {
                logger.warning("Member demote failed, master member is not known!");
                return false;
            }
            shouldRetry = false;
            boolean demoteInitiated;
            if (node.isMaster()) {
                demoteInitiated = onDemoteRequest(node.getLocalMember());
            } else {
                UUID memberUuid = node.getLocalMember().getUuid();
                DemoteRequestOperation op = new DemoteRequestOperation(memberUuid);
                InvocationFuture<Boolean> demotionInitResult = operationService.invokeOnMaster(op.getServiceName(),
                        op);
                long startTime = System.currentTimeMillis();
                try {
                    demoteInitiated = demotionInitResult.get(awaitStep, TimeUnit.MILLISECONDS);
                } catch (TimeoutException e) {
                    // allow the loop to continue with another demote attempt
                    demoteInitiated = false;
                    shouldRetry = true;
                } catch (InterruptedException e) {
                    // Let the outer handler restore the interrupt flag instead of
                    // reporting the interruption as a generic failure.
                    throw e;
                } catch (Exception e) {
                    logger.warning("Failed to initialize member demotion", e);
                    return false;
                } finally {
                    // Always charge the time spent waiting, including on timeout; otherwise
                    // repeated timeouts would never consume the budget and the loop would not end.
                    timeoutMillis -= System.currentTimeMillis() - startTime;
                }
            }
            if (demoteInitiated) {
                // The budget may already be overdrawn by the wait above; never pass a negative timeout.
                long latchTimeout = Math.max(0L, Math.min(awaitStep, timeoutMillis));
                if (latch.await(latchTimeout, TimeUnit.MILLISECONDS)) {
                    return true;
                }
                timeoutMillis -= latchTimeout;
                shouldRetry = true;
            }
        } while (timeoutMillis > 0 && shouldRetry);
    } catch (InterruptedException e) {
        currentThread().interrupt();
        logger.info("Member demote is interrupted!");
    }
    return false;
}
/**
 * Asks the master to let the local member shut down safely and waits for the shutdown latch.
 * <p>
 * The request is re-sent after every await step of at most
 * {@code SAFE_SHUTDOWN_MAX_AWAIT_STEP_MILLIS} until the latch is released or the timeout is used up.
 *
 * @param timeout maximum time to wait
 * @param unit    unit of {@code timeout}
 * @return {@code true} if the member has not joined, is a lite member, or the latch was released
 *         in time; {@code false} if the master is unknown, the timeout elapsed, or the thread
 *         was interrupted
 */
@Override
public boolean onShutdown(long timeout, TimeUnit unit) {
    if (!node.getClusterService().isJoined() || node.isLiteMember()) {
        return true;
    }
    final CountDownLatch shutdownLatch = getShutdownLatch();
    final OperationServiceImpl opService = nodeEngine.getOperationService();
    long remainingMillis = unit.toMillis(timeout);
    final long stepMillis = Math.min(SAFE_SHUTDOWN_MAX_AWAIT_STEP_MILLIS, remainingMillis);
    try {
        while (true) {
            final Address master = nodeEngine.getMasterAddress();
            if (master == null) {
                logger.warning("Safe shutdown failed, master member is not known!");
                return false;
            }
            final boolean localIsMaster = node.getThisAddress().equals(master);
            if (localIsMaster) {
                onShutdownRequest(node.getLocalMember());
            } else {
                opService.send(new ShutdownRequestOperation(node.getLocalMember().getUuid()), master);
            }
            if (shutdownLatch.await(stepMillis, TimeUnit.MILLISECONDS)) {
                return true;
            }
            remainingMillis -= stepMillis;
            if (remainingMillis <= 0) {
                break;
            }
        }
    } catch (InterruptedException e) {
        currentThread().interrupt();
        logger.info("Safe shutdown is interrupted!");
    }
    return false;
}
/**
 * Returns the latch stored in {@code shutdownLatchRef}, installing a new single-count latch
 * first when the reference is still empty.
 */
private CountDownLatch getShutdownLatch() {
    final CountDownLatch existing = shutdownLatchRef.get();
    if (existing != null) {
        return existing;
    }
    final CountDownLatch created = new CountDownLatch(1);
    // If the compare-and-set fails, return whatever the reference holds now.
    return shutdownLatchRef.compareAndSet(null, created) ? created : shutdownLatchRef.get();
}
/**
 * Returns the latch stored in {@code demoteLatchRef}, installing a new single-count latch
 * when the reference is empty.
 * <p>
 * {@link #onDemoteResponse()} resets the reference to {@code null}, so a plain re-read after a
 * failed compare-and-set could observe {@code null}. The loop retries until a non-null latch
 * is either read or installed.
 *
 * @return a latch, never {@code null}
 */
private CountDownLatch getDemoteLatch() {
    while (true) {
        CountDownLatch latch = demoteLatchRef.get();
        if (latch != null) {
            return latch;
        }
        latch = new CountDownLatch(1);
        if (demoteLatchRef.compareAndSet(null, latch)) {
            return latch;
        }
        // The reference changed between the read and the CAS; re-read it.
    }
}
/**
 * Forwards a demote request for {@code member} to {@code migrationManager} while holding
 * the partition service lock.
 *
 * @param member the member the request is about
 * @return the value returned by {@code migrationManager.onDemoteRequest(member)}
 */
public boolean onDemoteRequest(Member member) {
    final boolean result;
    partitionServiceLock.lock();
    try {
        result = migrationManager.onDemoteRequest(member);
    } finally {
        partitionServiceLock.unlock();
    }
    return result;
}
/**
 * Counts down the current demote latch, if there is one, and then clears
 * {@code demoteLatchRef} provided it still holds that same latch.
 */
public void onDemoteResponse() {
    final CountDownLatch pending = demoteLatchRef.get();
    if (pending == null) {
        return;
    }
    pending.countDown();
    demoteLatchRef.compareAndSet(pending, null);
}
/**
 * Forwards a shutdown request for {@code member} to {@code migrationManager}, but only if the
 * partition service lock can be acquired without waiting; otherwise the request is dropped.
 *
 * @param member the member the request is about
 */
public void onShutdownRequest(Member member) {
    if (!partitionServiceLock.tryLock()) {
        return;
    }
    try {
        migrationManager.onShutdownRequest(member);
    } finally {
        partitionServiceLock.unlock();
    }
}
/** Counts down the latch currently stored in {@code shutdownLatchRef}; the latch is asserted to exist. */
public void onShutdownResponse() {
    final CountDownLatch shutdownLatch = shutdownLatchRef.get();
    assert shutdownLatch != null;
    shutdownLatch.countDown();
}
/** Returns {@code true} when the replica state checker reports the partition service state as {@code SAFE}. */
@Override
public boolean isMemberStateSafe() {
    return PartitionServiceState.SAFE == this.partitionReplicaStateChecker.getPartitionServiceState();
}
/** Returns {@code true} when the replica state checker reports the partition table state as {@code SAFE}. */
@Override
public boolean isPartitionTableSafe() {
    return PartitionServiceState.SAFE == this.partitionReplicaStateChecker.getPartitionTableState();
}
/**
 * Returns {@code true} if a migration is ongoing locally. Otherwise, when the local member
 * is not the master, the answer comes from
 * {@code partitionReplicaStateChecker.hasOnGoingMigrationMaster(Level.FINEST)}.
 */
@Override
public boolean hasOnGoingMigration() {
    if (hasOnGoingMigrationLocal()) {
        return true;
    }
    if (isLocalMemberMaster()) {
        return false;
    }
    return partitionReplicaStateChecker.hasOnGoingMigrationMaster(Level.FINEST);
}
/** Returns the result of {@code migrationManager.hasOnGoingMigration()}. */
@Override
public boolean hasOnGoingMigrationLocal() {
    return this.migrationManager.hasOnGoingMigration();
}
/**
 * Maps the partition hash of {@code key} to a partition id via
 * {@code HashUtil.hashToIndex} over {@code partitionCount}.
 */
@Override
public final int getPartitionId(@Nonnull Data key) {
    final int partitionHash = key.getPartitionHash();
    return HashUtil.hashToIndex(partitionHash, this.partitionCount);
}
/** Converts {@code key} with {@code nodeEngine.toData} and resolves the partition id of the result. */
@Override
public final int getPartitionId(@Nonnull Object key) {
    final Data keyData = nodeEngine.toData(key);
    return getPartitionId(keyData);
}
/** Returns the {@code partitionCount} field of this service. */
@Override
public final int getPartitionCount() {
    return this.partitionCount;
}
/** Returns the {@code partitionMigrationTimeout} field of this service. */
public long getPartitionMigrationTimeout() {
    return this.partitionMigrationTimeout;
}
/** Returns {@code replicaManager} as the {@link PartitionReplicaVersionManager} of this service. */
@Override
public PartitionReplicaVersionManager getPartitionReplicaVersionManager() {
    return this.replicaManager;
}
@Override
public Map> getMemberPartitionsMap() {
Collection dataMembers = node.getClusterService().getMembers(DATA_MEMBER_SELECTOR);
int dataMembersSize = dataMembers.size();
int partitionsPerMember = (dataMembersSize > 0 ? (int) ceil((float) partitionCount / dataMembersSize) : 0);
Map> memberPartitions = createHashMap(dataMembersSize);
for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
Address owner = getPartitionOwnerOrWait(partitionId);
List ownedPartitions =
memberPartitions.computeIfAbsent(owner, k -> new ArrayList<>(partitionsPerMember));
ownedPartitions.add(partitionId);
}
return memberPartitions;
}
/**
 * Collects the ids of all partitions whose owner, as returned by {@code getPartitionOwner},
 * equals {@code target}.
 *
 * @param target address to match partition owners against
 * @return partition ids in ascending order; empty if {@code target} owns none
 */
@Override
public List<Integer> getMemberPartitions(Address target) {
    List<Integer> ownedPartitions = new ArrayList<>();
    for (int i = 0; i < partitionCount; i++) {
        final Address owner = getPartitionOwner(i);
        if (target.equals(owner)) {
            ownedPartitions.add(i);
        }
    }
    return ownedPartitions;
}
/**
 * Variant of {@link #getMemberPartitions(Address)} that returns an empty list while
 * {@code partitionStateManager} is not initialized.
 *
 * @param target address to match partition owners against
 * @return ids of the partitions owned by {@code target}, or an empty list if the partition
 *         state is not initialized
 */
@Override
public List<Integer> getMemberPartitionsIfAssigned(Address target) {
    if (!partitionStateManager.isInitialized()) {
        return Collections.emptyList();
    }
    return getMemberPartitions(target);
}
/**
 * Under the partition service lock, clears the fetch-partition-tables flag and resets the
 * replica manager, the partition state manager and the migration manager, in that order.
 */
@Override
public void reset() {
    this.partitionServiceLock.lock();
    try {
        this.shouldFetchPartitionTables = false;
        this.replicaManager.reset();
        this.partitionStateManager.reset();
        this.migrationManager.reset();
    } finally {
        this.partitionServiceLock.unlock();
    }
}
/** Delegates to {@code migrationManager.pauseMigration()}. */
@Override
public void pauseMigration() {
    this.migrationManager.pauseMigration();
}
/** Delegates to {@code migrationManager.resumeMigration()}. */
@Override
public void resumeMigration() {
    this.migrationManager.resumeMigration();
}
/** Returns the result of {@code migrationManager.areMigrationTasksAllowed()}. */
public boolean areMigrationTasksAllowed() {
    return this.migrationManager.areMigrationTasksAllowed();
}
/**
 * Stops the migration manager and then resets this service.
 *
 * @param terminate not read by this implementation
 */
@Override
public void shutdown(boolean terminate) {
    this.logger.finest("Shutting down the partition service");
    this.migrationManager.stop();
    this.reset();
}
/** Returns the migration queue size reported by {@code migrationManager}; also exposed as a metric probe. */
@Probe(name = PARTITIONS_METRIC_PARTITION_SERVICE_MIGRATION_QUEUE_SIZE)
@Override
public long getMigrationQueueSize() {
    final long queueSize = this.migrationManager.getMigrationQueueSize();
    return queueSize;
}
/** Returns the {@code proxy} field of this service. */
@Override
public PartitionServiceProxy getPartitionServiceProxy() {
    return this.proxy;
}
/** Registers {@code listener} through {@code partitionEventManager} and returns the id it hands back. */
@Override
public UUID addMigrationListener(MigrationListener listener) {
    final UUID registrationId = this.partitionEventManager.addMigrationListener(listener);
    return registrationId;
}
/** Delegates to {@code partitionEventManager.addMigrationListenerAsync(migrationListener)}. */
@Override
public CompletableFuture addMigrationListenerAsync(MigrationListener migrationListener) {
    return this.partitionEventManager.addMigrationListenerAsync(migrationListener);
}
/** Registers {@code migrationListener} as a local listener through {@code partitionEventManager}. */
@Override
public UUID addLocalMigrationListener(MigrationListener migrationListener) {
    final UUID registrationId = this.partitionEventManager.addLocalMigrationListener(migrationListener);
    return registrationId;
}
/** Returns the result of {@code partitionEventManager.removeMigrationListener(registrationId)}. */
@Override
public boolean removeMigrationListener(UUID registrationId) {
    final boolean removed = this.partitionEventManager.removeMigrationListener(registrationId);
    return removed;
}
/** Delegates to {@code partitionEventManager.removeMigrationListenerAsync(registrationId)}. */
@Override
public CompletableFuture removeMigrationListenerAsync(UUID registrationId) {
    return this.partitionEventManager.removeMigrationListenerAsync(registrationId);
}
/** Registers {@code listener} through {@code partitionEventManager} and returns the id it hands back. */
@Override
public UUID addPartitionLostListener(PartitionLostListener listener) {
    final UUID registrationId = this.partitionEventManager.addPartitionLostListener(listener);
    return registrationId;
}
/** Delegates to {@code partitionEventManager.addPartitionLostListenerAsync(listener)}. */
@Override
public CompletableFuture addPartitionLostListenerAsync(PartitionLostListener listener) {
    return this.partitionEventManager.addPartitionLostListenerAsync(listener);
}
/** Registers {@code listener} as a local listener through {@code partitionEventManager}. */
@Override
public UUID addLocalPartitionLostListener(PartitionLostListener listener) {
    final UUID registrationId = this.partitionEventManager.addLocalPartitionLostListener(listener);
    return registrationId;
}
/** Returns the result of {@code partitionEventManager.removePartitionLostListener(registrationId)}. */
@Override
public boolean removePartitionLostListener(UUID registrationId) {
    final boolean removed = this.partitionEventManager.removePartitionLostListener(registrationId);
    return removed;
}
/** Delegates to {@code partitionEventManager.removePartitionLostListenerAsync(registrationId)}. */
@Override
public CompletableFuture removePartitionLostListenerAsync(UUID registrationId) {
    return this.partitionEventManager.removePartitionLostListenerAsync(registrationId);
}
/**
 * Hands {@code event} to {@code partitionEventListener} by calling its {@code onEvent} method.
 *
 * @param event                  the partition event to deliver
 * @param partitionEventListener the listener that receives the event
 */
@Override
public void dispatchEvent(PartitionEvent event, PartitionEventListener partitionEventListener) {
// Internal event not used for UCD, no Namespace awareness needed
partitionEventListener.onEvent(event);
}
/** Returns {@code isLocal()} of the partition that {@code partitionStateManager} holds for {@code partitionId}. */
@Override
public boolean isPartitionOwner(int partitionId) {
    return this.partitionStateManager.getPartitionImpl(partitionId).isLocal();
}
/** Returns the stamp reported by {@code partitionStateManager}. */
@Override
public long getPartitionStateStamp() {
    final long stamp = this.partitionStateManager.getStamp();
    return stamp;
}
/** Forwards the partition lost event to {@code partitionEventManager}. */
@Override
public void onPartitionLost(IPartitionLostEvent event) {
    this.partitionEventManager.onPartitionLost(event);
}
/** Installs {@code listener} as the migration interceptor of {@code migrationManager}. */
public void setMigrationInterceptor(MigrationInterceptor listener) {
    this.migrationManager.setMigrationInterceptor(listener);
}
/** Returns the migration interceptor currently held by {@code migrationManager}. */
public MigrationInterceptor getMigrationInterceptor() {
    return this.migrationManager.getMigrationInterceptor();
}
/** Delegates to {@code migrationManager.resetMigrationInterceptor()}. */
public void resetMigrationInterceptor() {
    this.migrationManager.resetMigrationInterceptor();
}
/**
 * Delegates to {@code replicaManager.getOngoingReplicaSyncRequests()}.
 *
 * @return copy of ongoing replica-sync operations
 */
public List getOngoingReplicaSyncRequests() {
    return this.replicaManager.getOngoingReplicaSyncRequests();
}
/**
* @return copy of scheduled replica-sync requests
*/
public List> getScheduledReplicaSyncRequests() {
return replicaManager.getScheduledReplicaSyncRequests();
}
/** Returns the {@code partitionStateManager} field of this service. */
public PartitionStateManager getPartitionStateManager() {
    return this.partitionStateManager;
}
/** Returns the {@code migrationManager} field of this service. */
public MigrationManager getMigrationManager() {
    return this.migrationManager;
}
/** Returns the {@code replicaManager} field of this service. */
public PartitionReplicaManager getReplicaManager() {
    return this.replicaManager;
}
/** Returns the {@code partitionReplicaStateChecker} field of this service. */
@Override
public PartitionReplicaStateChecker getPartitionReplicaStateChecker() {
    return this.partitionReplicaStateChecker;
}
/** Returns the {@code partitionEventManager} field of this service. */
public PartitionEventManager getPartitionEventManager() {
    return this.partitionEventManager;
}
/** Returns the current value of the {@code shouldFetchPartitionTables} flag. */
@Override
public boolean isFetchMostRecentPartitionTableTaskRequired() {
    return this.shouldFetchPartitionTables;
}
/**
 * Under the partition service lock, schedules a new {@code FetchMostRecentPartitionTableTask}
 * on {@code migrationManager} if {@code shouldFetchPartitionTables} is set.
 *
 * @return {@code true} if a task was scheduled, {@code false} otherwise
 */
boolean scheduleFetchMostRecentPartitionTableTaskIfRequired() {
    partitionServiceLock.lock();
    try {
        final boolean required = shouldFetchPartitionTables;
        if (required) {
            migrationManager.schedule(new FetchMostRecentPartitionTableTask());
        }
        return required;
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
 * Calls {@code partitionStateManager.replaceMember(oldMember, newMember)} while holding the
 * partition service lock.
 *
 * @param oldMember the member to be replaced
 * @param newMember the member that takes its place
 */
public void replaceMember(Member oldMember, Member newMember) {
    this.partitionServiceLock.lock();
    try {
        this.partitionStateManager.replaceMember(oldMember, newMember);
    } finally {
        this.partitionServiceLock.unlock();
    }
}
/**
 * Obtains the partition table from {@code partitionStateManager} while holding the
 * partition service lock.
 */
@Override
public PartitionTableView createPartitionTableView() {
    final PartitionTableView view;
    partitionServiceLock.lock();
    try {
        view = partitionStateManager.getPartitionTable();
    } finally {
        partitionServiceLock.unlock();
    }
    return view;
}
/** Returns the snapshot that {@code partitionStateManager} holds for the given member UUID. */
public PartitionTableView getLeftMemberSnapshot(UUID uuid) {
    return this.partitionStateManager.getSnapshot(uuid);
}
/**
 * Tells whether the local member is the last master known to
 * {@code InternalPartitionServiceImpl}.
 * <p>
 * The last known master is updated under the partition service lock
 * when the former master leaves the cluster.
 * <p>
 * Use this method rather than {@code node.isMaster()} whenever the
 * logic relies on being master.
 *
 * @return result of {@link #isMemberMaster(Address)} for the local address
 */
public boolean isLocalMemberMaster() {
    final Address localAddress = node.getThisAddress();
    return isMemberMaster(localAddress);
}
/**
 * Tells whether {@code address} is the master according to both
 * {@code InternalPartitionServiceImpl} and {@code ClusterServiceImpl}.
 * <p>
 * The last known master is updated under the partition service lock
 * when the former master leaves the cluster.
 *
 * @param address the address to check; {@code null} yields {@code false}
 * @return {@code true} only if both services agree that {@code address} is the master
 */
public boolean isMemberMaster(Address address) {
    if (address == null) {
        return false;
    }
    final ClusterServiceImpl cluster = node.getClusterService();
    Address knownMaster = latestMaster;
    if (knownMaster == null && cluster.getSize() == 1) {
        // If this is the only node, we can rely on ClusterService only.
        knownMaster = cluster.getMasterAddress();
    }
    // address should be the known master by both PartitionService and ClusterService.
    if (!address.equals(knownMaster)) {
        return false;
    }
    return address.equals(cluster.getMasterAddress());
}
/**
 * Commits {@code migration} on the local member under the partition service lock.
 *
 * @param migration the migration to commit
 * @param sender    address of the member requesting the commit; must be the known master
 * @return {@code true} if the migration was committed now or the local partition is already at
 *         the migration's final version; {@code false} if the sender is not validated as master
 *         or the local partition version is already beyond the final version
 * @throws IllegalStateException if the local partition version differs from the migration's
 *                               initial version
 */
@SuppressWarnings("checkstyle:npathcomplexity")
public boolean commitMigrationOnDestination(MigrationInfo migration, Address sender) {
    partitionServiceLock.lock();
    try {
        if (!validateSenderIsMaster(sender, "migration commit")) {
            return false;
        }
        final InternalPartitionImpl targetPartition =
                partitionStateManager.getPartitionImpl(migration.getPartitionId());
        final int localVersion = targetPartition.version();
        final int expectedVersion = migration.getInitialPartitionVersion();
        final int committedVersion = migration.getFinalPartitionVersion();

        if (committedVersion <= localVersion) {
            // Nothing to apply: the local partition is already at or past the committed version.
            final boolean sameVersion = committedVersion == localVersion;
            if (logger.isFineEnabled()) {
                if (sameVersion) {
                    logger.fine("Already applied migration commit. Version: " + localVersion + ", Master: " + sender);
                } else {
                    logger.fine("Already applied migration commit. Local version: " + localVersion
                            + ", Master version: " + committedVersion + " Master: " + sender);
                }
            }
            return sameVersion;
        }
        if (expectedVersion != localVersion) {
            throw new IllegalStateException("Invalid migration commit! Expected version: " + expectedVersion
                    + ", current version: " + localVersion + ", Master: " + sender);
        }

        final MigrationInfo activeMigration = migrationManager.getActiveMigration(migration.getPartitionId());
        assert migration.equals(activeMigration) : "Committed migration: " + migration
                + ", Active migration: " + activeMigration;

        final boolean added = migrationManager.addCompletedMigration(migration);
        assert added : "Could not add completed migration on destination: " + migration;

        applyMigration(targetPartition, migration);
        assert targetPartition.version() == committedVersion
                : "Current: " + targetPartition.version() + ", Expected: " + committedVersion;

        activeMigration.setStatus(migration.getStatus());
        migrationManager.finalizeMigration(migration);
        if (logger.isFineEnabled()) {
            logger.fine("Committed " + migration + " on destination with partition version: " + committedVersion);
        }
        return true;
    } finally {
        partitionServiceLock.unlock();
    }
}
/**
* Invoked on a node when it becomes master. It will receive partition states from all members and consolidate them into one.
* It guarantees the monotonicity of the partition table.
*
* - Fetch partition tables from all cluster members
* - Pick the most up to date partition table and apply it to local
* - Complete the pending migration, if present
* - Send the new partition table to all cluster members
*
*/
private class FetchMostRecentPartitionTableTask implements MigrationRunnable {
private final Address thisAddress = node.getThisAddress();
private InternalPartition[] latestPartitions;
private boolean initialized = partitionStateManager.isInitialized();
public void run() {
ClusterState clusterState = node.getClusterService().getClusterState();
if (!clusterState.isMigrationAllowed() && !clusterState.isPartitionPromotionAllowed()) {
// If migrations and promotions are not allowed, partition table cannot be modified and we should have
// the most recent partition table already. Because cluster state cannot be changed
// when our partition table is stale.
logger.fine("No need to fetch the latest partition table. "
+ "Cluster state does not allow to modify partition table.");
shouldFetchPartitionTables = false;
return;
}
syncWithPartitionThreads();
logger.info("Fetching partition tables from cluster to determine the most recent one... "
+ "Local stamp: " + partitionStateManager.getStamp());
latestPartitions = partitionStateManager.getPartitionsCopy(true);
Collection allCompletedMigrations = new HashSet<>();
Collection allActiveMigrations = new HashSet<>();
collectAndProcessResults(allCompletedMigrations, allActiveMigrations);
logger.info("Most recent partition table is determined.");
processNewState(allCompletedMigrations, allActiveMigrations);
publishPartitionRuntimeState();
}
private Future fetchPartitionState(Member m) {
return nodeEngine.getOperationService()
.invokeOnTarget(SERVICE_NAME, new FetchPartitionStateOperation(), m.getAddress());
}
/** Collects all completed and active migrations and sets the partition state to the latest version. */
private void collectAndProcessResults(Collection allCompletedMigrations,
Collection allActiveMigrations) {
Collection members = node.clusterService.getMembers(NON_LOCAL_MEMBER_SELECTOR);
Map> futures = new HashMap<>();
for (Member member : members) {
Future future = fetchPartitionState(member);
futures.put(member, future);
}
while (!futures.isEmpty()) {
Iterator>> iter = futures.entrySet().iterator();
while (iter.hasNext()) {
PartitionRuntimeState state = collectNextPartitionState(iter);
if (state == null) {
// state can be null, if not initialized or operation is retried
continue;
}
for (InternalPartition partition : state.getPartitions()) {
int partitionId = partition.getPartitionId();
InternalPartition latestPartition = latestPartitions[partitionId];
if (latestPartition.version() < partition.version()) {
latestPartitions[partitionId] = partition;
initialized = true;
} else {
if (latestPartition.version() == partition.version()) {
if (!latestPartition.equals(partition)) {
logger.warning("Issue while determining latest partition... Latest: " + latestPartition
+ ", Received: " + partition);
}
}
}
}
allCompletedMigrations.addAll(state.getCompletedMigrations());
if (state.getActiveMigrations() != null) {
allActiveMigrations.addAll(state.getActiveMigrations());
}
}
}
}
/**
* Fetches known partition state from next member and returns null if target member is left
* and/or not a member of this cluster anymore.
* If future timeouts, then fetch operation is retried until we learn target member's partition state
* or it leaves the cluster.
*/
private PartitionRuntimeState collectNextPartitionState(Iterator>> iter) {
Map.Entry> next = iter.next();
Member member = next.getKey();
Future future = next.getValue();
boolean collectedState = true;
try {
PartitionRuntimeState state = future.get(FETCH_PARTITION_STATE_SECONDS, SECONDS);
if (state == null) {
logger.fine("Received NULL partition state from " + member);
} else {
logger.fine("Received partition state version from " + member);
}
return state;
} catch (InterruptedException e) {
logger.fine("FetchMostRecentPartitionTableTask is interrupted.");
Thread.currentThread().interrupt();
} catch (TimeoutException e) {
collectedState = false;
// Fetch partition state operation is idempotent.
// We will retry it until it we learn the partition state or the member leaves the cluster.
// We can't just rely on invocation retries, because if connection is dropped while
// our operation is on the wire, invocation won't get any response and will eventually timeout.
next.setValue(fetchPartitionState(member));
} catch (Exception e) {
Level level = Level.SEVERE;
if ((e instanceof MemberLeftException) || (e.getCause() instanceof TargetNotMemberException)) {
level = Level.FINE;
}
logger.log(level, "Failed to fetch partition table from " + member, e);
} finally {
if (collectedState) {
iter.remove();
}
}
return null;
}
/**
* Applies a partition state and marks all migrations (including local) as complete, when a newer state is received.
* The method will acquire the partition state lock.
*
* @param allCompletedMigrations received completed migrations from other nodes
* @param allActiveMigrations received active migrations from other nodes
*/
private void processNewState(Collection allCompletedMigrations,
Collection allActiveMigrations) {
partitionServiceLock.lock();
try {
processMigrations(allCompletedMigrations, allActiveMigrations);
if (initialized) {
for (MigrationInfo migration : allCompletedMigrations) {
if (migration.getStatus() == MigrationStatus.FAILED) {
// Apply version increment of failed active migrations.
int partitionId = migration.getPartitionId();
InternalPartition partition = latestPartitions[partitionId];
latestPartitions[partitionId] = new ReadonlyInternalPartition(partition.getReplicasCopy(),
partitionId, migration.getFinalPartitionVersion());
}
}
logger.info("Applying the most recent of partition state...");
applyNewPartitionTable(latestPartitions, allCompletedMigrations, thisAddress);
}
shouldFetchPartitionTables = false;
} catch (Throwable rethrowed) {
String lineSeparator = System.lineSeparator();
StringBuilder sb = new StringBuilder()
.append("latestPartitions:").append(lineSeparator)
.append(StringUtil.toString(latestPartitions)).append(lineSeparator)
.append("allCompletedMigrations:").append(lineSeparator)
.append(StringUtil.toString(allCompletedMigrations)).append(lineSeparator)
.append("allActiveMigrations:").append(lineSeparator)
.append(StringUtil.toString((allActiveMigrations))).append(lineSeparator)
.append(rethrowed);
logger.warning(sb.toString());
throw rethrowed;
} finally {
partitionServiceLock.unlock();
}
}
/** Moves all migrations to completed (including local) and marks active migrations as {@link MigrationStatus#FAILED}. */
private void processMigrations(Collection allCompletedMigrations,
Collection allActiveMigrations) {
allCompletedMigrations.addAll(migrationManager.getCompletedMigrationsCopy());
allActiveMigrations.addAll(migrationManager.getActiveMigrations());
for (MigrationInfo migration : allActiveMigrations) {
if (allCompletedMigrations.add(migration)) {
logger.info("Marking active migration " + migration + " as " + MigrationStatus.FAILED);
migration.setStatus(MigrationStatus.FAILED);
migration.setPartitionVersionIncrement(migration.getPartitionVersionIncrement() + 1);
}
}
}
private void syncWithPartitionThreads() {
OperationExecutor opExecutor = nodeEngine.getOperationService().getOperationExecutor();
CountDownLatch latch = new CountDownLatch(opExecutor.getPartitionThreadCount());
opExecutor.executeOnPartitionThreads(new PartitionThreadBarrierTask(latch));
try {
latch.await();
} catch (InterruptedException e) {
logger.warning(e);
Thread.currentThread().interrupt();
}
}
/**
* PartitionThreadBarrierTask is executed on all partition operation threads
* and to ensure all pending/running migration operations are completed.
*/
private final class PartitionThreadBarrierTask implements Runnable, UrgentSystemOperation {
private final CountDownLatch latch;
private PartitionThreadBarrierTask(CountDownLatch latch) {
this.latch = latch;
}
@Override
public void run() {
latch.countDown();
}
}
}
/** Renders the partition state stamp and the migration queue size of this service. */
@Override
public String toString() {
    final long stamp = getPartitionStateStamp();
    final long queueSize = getMigrationQueueSize();
    return "InternalPartitionService {" + "stamp: " + stamp + ", migrationQ: " + queueSize + "}";
}
}