/*
 * Copyright (c) 2008-2018, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.internal.partition.impl;

import com.hazelcast.instance.Node;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.internal.partition.InternalPartition;
import com.hazelcast.internal.partition.NonFragmentedServiceNamespace;
import com.hazelcast.internal.partition.PartitionReplicaVersionManager;
import com.hazelcast.internal.partition.operation.PartitionReplicaSyncRequest;
import com.hazelcast.internal.util.counters.MwCounter;
import com.hazelcast.logging.ILogger;
import com.hazelcast.nio.Address;
import com.hazelcast.spi.ExecutionService;
import com.hazelcast.spi.Operation;
import com.hazelcast.spi.ServiceNamespace;
import com.hazelcast.spi.ServiceNamespaceAware;
import com.hazelcast.spi.TaskScheduler;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.PartitionSpecificRunnable;
import com.hazelcast.spi.properties.GroupProperty;
import com.hazelcast.spi.properties.HazelcastProperties;
import com.hazelcast.util.scheduler.EntryTaskScheduler;
import com.hazelcast.util.scheduler.EntryTaskSchedulerFactory;
import com.hazelcast.util.scheduler.ScheduleType;
import com.hazelcast.util.scheduler.ScheduledEntry;
import com.hazelcast.util.scheduler.ScheduledEntryProcessor;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import static com.hazelcast.internal.util.counters.MwCounter.newMwCounter;
import static java.util.Collections.newSetFromMap;

/**
 * Maintains the version values for the partition replicas and manages the replica-related operations for partitions.
 */
public class PartitionReplicaManager implements PartitionReplicaVersionManager {

    private final Node node;
    private final NodeEngineImpl nodeEngine;
    private final ILogger logger;
    private final InternalPartitionServiceImpl partitionService;
    private final PartitionStateManager partitionStateManager;

    private final PartitionReplicaVersions[] replicaVersions;
    /** Replica sync requests that have been sent to the target and are awaiting a response */
    private final Set<ReplicaFragmentSyncInfo> replicaSyncRequests;
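    /** Times out pending replica sync requests; see {@link ReplicaSyncTimeoutProcessor}. */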
    private final EntryTaskScheduler<ReplicaFragmentSyncInfo, Void> replicaSyncTimeoutScheduler;
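    /** Limits the number of replica synchronizations that may run in parallel on this member. */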
    @Probe
    private final Semaphore replicaSyncProcessLock;
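    /** Counts the replica sync requests sent by this member. */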
    @Probe
    private final MwCounter replicaSyncRequestsCounter = newMwCounter();

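    /** Migration timeout, also used as the delay before a pending replica sync request times out. */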
    private final long partitionMigrationTimeout;
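    /** Maximum number of parallel replica synchronizations; initial permit count of {@code replicaSyncProcessLock}. */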
    private final int maxParallelReplications;

    PartitionReplicaManager(Node node, InternalPartitionServiceImpl partitionService) {
        this.node = node;
        this.nodeEngine = node.nodeEngine;
        this.logger = node.getLogger(getClass());
        this.partitionService = partitionService;

        int partitionCount = partitionService.getPartitionCount();
        partitionStateManager = partitionService.getPartitionStateManager();

        HazelcastProperties properties = node.getProperties();
        partitionMigrationTimeout = properties.getMillis(GroupProperty.PARTITION_MIGRATION_TIMEOUT);
        maxParallelReplications = properties.getInteger(GroupProperty.PARTITION_MAX_PARALLEL_REPLICATIONS);
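        // One permit is acquired per outgoing replica sync request and released when the sync
        // completes, times out or is cancelled.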
        replicaSyncProcessLock = new Semaphore(maxParallelReplications);

        replicaVersions = new PartitionReplicaVersions[partitionCount];
        for (int i = 0; i < replicaVersions.length; i++) {
            replicaVersions[i] = new PartitionReplicaVersions(i);
        }

        ExecutionService executionService = nodeEngine.getExecutionService();
        TaskScheduler globalScheduler = executionService.getGlobalTaskScheduler();

        // The reason this scheduler uses the POSTPONE type is as follows:
        // When a node shifts up in the replica table upon a node failure, it sends a sync request to the partition owner and
        // registers it to the replicaSyncRequests. If another node fails before the already-running sync process completes,
        // the new sync request is simply scheduled to a further time. Again, before the already-running sync process completes,
        // if another node fails for the third time, the already-scheduled sync request should be overwritten with the new one.
        // This is because this node is shifted up to a higher level when the third node failure occurs and its respective sync
        // request will inherently include the backup data that is requested by the previously scheduled sync request.
        replicaSyncTimeoutScheduler = EntryTaskSchedulerFactory.newScheduler(globalScheduler,
                new ReplicaSyncTimeoutProcessor(), ScheduleType.POSTPONE);

        replicaSyncRequests = newSetFromMap(new ConcurrentHashMap<ReplicaFragmentSyncInfo, Boolean>(partitionCount));
    }

    /**
     * This method is called on a backup node (replica). Given all conditions are satisfied, this method initiates a replica sync
     * operation and registers it to replicaSyncRequest. The operation is scheduled for a future execution if:
     * <ul>
     * <li>the {@code delayMillis} is greater than 0</li>
     * <li>if a migration is not allowed (during repartitioning or a node joining the cluster)</li>
     * <li>the partition is currently migrating</li>
     * <li>another sync request has already been sent</li>
     * <li>the maximum number of parallel synchronizations has already been reached</li>
     * </ul>
     *
     * @param partitionId  the partition which is being synchronized
     * @param namespaces   namespaces of partition replica fragments
     * @param replicaIndex the index of the replica which is being synchronized
     * @throws IllegalArgumentException if the replica index is not between 0 and {@link InternalPartition#MAX_REPLICA_COUNT}
     */
    public void triggerPartitionReplicaSync(int partitionId, Collection<ServiceNamespace> namespaces, int replicaIndex) {
        assert replicaIndex >= 0 && replicaIndex < InternalPartition.MAX_REPLICA_COUNT
                : "Invalid replica index! partitionId=" + partitionId + ", replicaIndex=" + replicaIndex;

        Address target = checkAndGetPrimaryReplicaOwner(partitionId, replicaIndex);
        if (target == null) {
            return;
        }

        if (!partitionService.isMigrationAllowed()) {
            logger.finest("Cannot send sync replica request for partitionId=" + partitionId
                    + ", replicaIndex=" + replicaIndex + ", namespaces=" + namespaces + ". Sync is not allowed.");
            return;
        }

        InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(partitionId);
        if (partition.isMigrating()) {
            logger.finest("Cannot send sync replica request for partitionId=" + partitionId
                    + ", replicaIndex=" + replicaIndex + ", namespaces=" + namespaces + ". Partition is already migrating.");
            return;
        }

        sendSyncReplicaRequest(partitionId, namespaces, replicaIndex, target);
    }

    /** Checks preconditions for replica sync - if we don't know the owner yet, if this node is the owner or not a replica */
    Address checkAndGetPrimaryReplicaOwner(int partitionId, int replicaIndex) {
        final InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(partitionId);
        final Address target = partition.getOwnerOrNull();
        if (target == null) {
            logger.info("Sync replica target is null, no need to sync -> partitionId=" + partitionId
                    + ", replicaIndex=" + replicaIndex);
            return null;
        }

        Address thisAddress = nodeEngine.getThisAddress();
        if (target.equals(thisAddress)) {
            if (logger.isFinestEnabled()) {
                logger.finest("This node is now owner of partition, cannot sync replica -> partitionId=" + partitionId
                        + ", replicaIndex=" + replicaIndex + ", partition-info="
                        + partitionStateManager.getPartitionImpl(partitionId));
            }
            return null;
        }

        if (!partition.isOwnerOrBackup(thisAddress)) {
            if (logger.isFinestEnabled()) {
                logger.finest("This node is not backup replica of partitionId=" + partitionId
                        + ", replicaIndex=" + replicaIndex + " anymore.");
            }
            return null;
        }
        return target;
    }

    /**
     * Send the sync request to {@code target} if the max number of parallel sync requests has not been reached and the target
     * was not removed while the cluster was not active. Also cancel any currently scheduled sync requests for the given
     * partition and schedule a new sync request that is to be run in the case of timeout
     */
    private void sendSyncReplicaRequest(int partitionId, Collection<ServiceNamespace> syncNamespaces, int replicaIndex,
            Address target) {
        if (node.clusterService.isMemberRemovedInNotJoinableState(target)) {
            return;
        }

        if (!tryToAcquireReplicaSyncPermit()) {
            if (logger.isFinestEnabled()) {
                logger.finest("Cannot send sync replica request for partitionId=" + partitionId
                        + ", replicaIndex=" + replicaIndex + ", namespaces=" + syncNamespaces + ". No permits available!");
            }
            return;
        }

        Collection<ServiceNamespace> namespaces = registerSyncInfoFor(partitionId, syncNamespaces, replicaIndex, target);
        if (namespaces.isEmpty()) {
            releaseReplicaSyncPermit();
            return;
        }

        if (logger.isFinestEnabled()) {
            logger.finest("Sending sync replica request for partitionId=" + partitionId
                    + ", replicaIndex=" + replicaIndex + ", namespaces=" + namespaces);
        }
        replicaSyncRequestsCounter.inc();

        PartitionReplicaSyncRequest syncRequest = new PartitionReplicaSyncRequest(partitionId, namespaces, replicaIndex);
        nodeEngine.getOperationService().send(syncRequest, target);
    }

    private Collection<ServiceNamespace> registerSyncInfoFor(int partitionId, Collection<ServiceNamespace> requestedNamespaces,
            int replicaIndex, Address target) {
        // namespaces arg may not support removal
        Collection<ServiceNamespace> namespaces = new ArrayList<ServiceNamespace>(requestedNamespaces);
        Iterator<ServiceNamespace> iter = namespaces.iterator();
        while (iter.hasNext()) {
            ServiceNamespace namespace = iter.next();
            ReplicaFragmentSyncInfo syncInfo = new ReplicaFragmentSyncInfo(partitionId, namespace, replicaIndex, target);
            if (!replicaSyncRequests.add(syncInfo)) {
                logger.finest("Cannot send sync replica request for " + syncInfo + ". Sync is already in progress!");
                iter.remove();
                continue;
            }
            replicaSyncTimeoutScheduler.schedule(partitionMigrationTimeout, syncInfo, null);
        }
        return namespaces;
    }

    @Override
    public ServiceNamespace getServiceNamespace(Operation operation) {
        if (operation instanceof ServiceNamespaceAware) {
            return ((ServiceNamespaceAware) operation).getServiceNamespace();
        }
        return NonFragmentedServiceNamespace.INSTANCE;
    }

    @Override
    // Caution: Returning version array without copying for performance reasons. Callers must not modify this array!
    public long[] incrementPartitionReplicaVersions(int partitionId, ServiceNamespace namespace, int backupCount) {
        PartitionReplicaVersions replicaVersion = replicaVersions[partitionId];
        return replicaVersion.incrementAndGet(namespace, backupCount);
    }

    @Override
    public void updatePartitionReplicaVersions(int partitionId, ServiceNamespace namespace, long[] versions, int replicaIndex) {
        PartitionReplicaVersions partitionVersion = replicaVersions[partitionId];
        if (!partitionVersion.update(namespace, versions, replicaIndex)) {
            // this partition backup is behind the owner or dirty.
            triggerPartitionReplicaSync(partitionId, Collections.singleton(namespace), replicaIndex);
        }
    }

    @Override
    public boolean isPartitionReplicaVersionStale(int partitionId, ServiceNamespace namespace, long[] versions,
            int replicaIndex) {
        return replicaVersions[partitionId].isStale(namespace, versions, replicaIndex);
    }

    // called in operation threads
    public boolean isPartitionReplicaVersionDirty(int partitionId, ServiceNamespace namespace) {
        return replicaVersions[partitionId].isDirty(namespace);
    }

    @Override
    // Caution: Returning version array without copying for performance reasons. Callers must not modify this array!
    public long[] getPartitionReplicaVersions(int partitionId, ServiceNamespace namespace) {
        return replicaVersions[partitionId].get(namespace);
    }

    // called in operation threads
    public void setPartitionReplicaVersions(int partitionId, ServiceNamespace namespace, long[] versions, int replicaOffset) {
        replicaVersions[partitionId].set(namespace, versions, replicaOffset);
    }

    // called in operation threads
    public void clearPartitionReplicaVersions(int partitionId, ServiceNamespace namespace) {
        replicaVersions[partitionId].clear(namespace);
    }

    /**
     * Set the new replica versions for the partition with the {@code partitionId} and reset any ongoing replica
     * synchronization request for this partition and replica index.
     *
     * @param partitionId  the partition ID
     * @param replicaIndex the index of the replica
     * @param versions     the new replica versions for the partition
     */
    // called in operation threads
    public void finalizeReplicaSync(int partitionId, int replicaIndex, ServiceNamespace namespace, long[] versions) {
        PartitionReplicaVersions replicaVersion = replicaVersions[partitionId];
        replicaVersion.clear(namespace);
        replicaVersion.set(namespace, versions, replicaIndex);
        clearReplicaSyncRequest(partitionId, namespace, replicaIndex);
    }

    /**
     * Resets the state of the replica synchronization request for the given partition and replica. This will cancel the
     * scheduled synchronization, clear the ongoing sync flag and release a synchronization permit.
     *
     * @param partitionId  the partition being synchronized
     * @param namespace    namespace
     * @param replicaIndex the index of the replica being synchronized
     */
    // called in operation threads
    public void clearReplicaSyncRequest(int partitionId, ServiceNamespace namespace, int replicaIndex) {
        ReplicaFragmentSyncInfo syncInfo = new ReplicaFragmentSyncInfo(partitionId, namespace, replicaIndex, null);
        if (!replicaSyncRequests.remove(syncInfo)) {
            return;
        }

        if (logger.isFinestEnabled()) {
            logger.finest("Clearing sync replica request for partitionId=" + partitionId
                    + ", replicaIndex=" + replicaIndex + ", namespace=" + namespace);
        }

        releaseReplicaSyncPermit();
        replicaSyncTimeoutScheduler.cancelIfExists(syncInfo, null);
    }

    void cancelReplicaSyncRequestsTo(Address deadAddress) {
        Iterator<ReplicaFragmentSyncInfo> iter = replicaSyncRequests.iterator();
        while (iter.hasNext()) {
            ReplicaFragmentSyncInfo syncInfo = iter.next();
            if (deadAddress.equals(syncInfo.target)) {
                iter.remove();
                replicaSyncTimeoutScheduler.cancel(syncInfo);
                releaseReplicaSyncPermit();
            }
        }
    }

    void cancelReplicaSync(int partitionId) {
        Iterator<ReplicaFragmentSyncInfo> iter = replicaSyncRequests.iterator();
        while (iter.hasNext()) {
            ReplicaFragmentSyncInfo syncInfo = iter.next();
            if (syncInfo.partitionId == partitionId) {
                iter.remove();
                replicaSyncTimeoutScheduler.cancel(syncInfo);
                releaseReplicaSyncPermit();
            }
        }
    }

    public boolean tryToAcquireReplicaSyncPermit() {
        return replicaSyncProcessLock.tryAcquire();
    }

    public void releaseReplicaSyncPermit() {
        replicaSyncProcessLock.release();
    }

    /**
     * @return copy of ongoing replica-sync operations
     */
    List<ReplicaFragmentSyncInfo> getOngoingReplicaSyncRequests() {
        return new ArrayList<ReplicaFragmentSyncInfo>(replicaSyncRequests);
    }

    /**
     * @return copy of scheduled replica-sync requests
     */
    List<ScheduledEntry<ReplicaFragmentSyncInfo, Void>> getScheduledReplicaSyncRequests() {
        final List<ScheduledEntry<ReplicaFragmentSyncInfo, Void>> entries
                = new ArrayList<ScheduledEntry<ReplicaFragmentSyncInfo, Void>>();
        for (ReplicaFragmentSyncInfo syncInfo : replicaSyncRequests) {
            ScheduledEntry<ReplicaFragmentSyncInfo, Void> entry = replicaSyncTimeoutScheduler.get(syncInfo);
            if (entry != null) {
                entries.add(entry);
            }
        }
        return entries;
    }

    void reset() {
        replicaSyncRequests.clear();
        replicaSyncTimeoutScheduler.cancelAll();
        // this is not sync with possibly running sync process
        // permit count can exceed allowed parallelization count.
        replicaSyncProcessLock.drainPermits();
        replicaSyncProcessLock.release(maxParallelReplications);
    }

    void scheduleReplicaVersionSync(ExecutionService executionService) {
        long definedBackupSyncCheckInterval = node.getProperties().getSeconds(GroupProperty.PARTITION_BACKUP_SYNC_INTERVAL);
        long backupSyncCheckInterval = definedBackupSyncCheckInterval > 0 ? definedBackupSyncCheckInterval : 1;
        executionService.scheduleWithRepetition(new AntiEntropyTask(),
                backupSyncCheckInterval, backupSyncCheckInterval, TimeUnit.SECONDS);
    }

    @Override
    public Collection<ServiceNamespace> getNamespaces(int partitionId) {
        return replicaVersions[partitionId].getNamespaces();
    }

    public void retainNamespaces(int partitionId, Set<ServiceNamespace> namespaces) {
        PartitionReplicaVersions versions = replicaVersions[partitionId];
        versions.retainNamespaces(namespaces);
    }

    private class ReplicaSyncTimeoutProcessor implements ScheduledEntryProcessor<ReplicaFragmentSyncInfo, Void> {

        @Override
        public void process(EntryTaskScheduler<ReplicaFragmentSyncInfo, Void> scheduler,
                Collection<ScheduledEntry<ReplicaFragmentSyncInfo, Void>> entries) {
            for (ScheduledEntry<ReplicaFragmentSyncInfo, Void> entry : entries) {
                ReplicaFragmentSyncInfo syncInfo = entry.getKey();
                if (replicaSyncRequests.remove(syncInfo)) {
                    releaseReplicaSyncPermit();
                }
            }
        }
    }

    private class AntiEntropyTask implements Runnable {

        @Override
        public void run() {
            if (!node.nodeEngine.isRunning() || !node.getNodeExtension().isStartCompleted()
                    || !partitionService.isMigrationAllowed()) {
                return;
            }

            for (InternalPartition partition : partitionStateManager.getPartitions()) {
                if (!partition.isLocal()) {
                    continue;
                }

                PartitionSpecificRunnable r
                        = new PartitionPrimaryReplicaAntiEntropyTask(nodeEngine, partition.getPartitionId());
                nodeEngine.getOperationService().execute(r);
            }
        }
    }
}



