/*
* Copyright (c) 2008-2015, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.partition.impl;
import com.hazelcast.cluster.MemberInfo;
import com.hazelcast.core.HazelcastException;
import com.hazelcast.core.Member;
import com.hazelcast.core.MigrationEvent;
import com.hazelcast.core.MigrationEvent.MigrationStatus;
import com.hazelcast.core.MigrationListener;
import com.hazelcast.instance.MemberImpl;
import com.hazelcast.instance.Node;
import com.hazelcast.instance.OutOfMemoryErrorDispatcher;
import com.hazelcast.logging.ILogger;
import com.hazelcast.logging.Logger;
import com.hazelcast.nio.Address;
import com.hazelcast.nio.serialization.Data;
import com.hazelcast.partition.InternalPartition;
import com.hazelcast.partition.InternalPartitionLostEvent;
import com.hazelcast.partition.InternalPartitionService;
import com.hazelcast.partition.MigrationEndpoint;
import com.hazelcast.partition.MigrationInfo;
import com.hazelcast.partition.PartitionEvent;
import com.hazelcast.partition.PartitionEventListener;
import com.hazelcast.partition.PartitionInfo;
import com.hazelcast.partition.PartitionLostEvent;
import com.hazelcast.partition.PartitionLostListener;
import com.hazelcast.partition.PartitionRuntimeState;
import com.hazelcast.partition.PartitionServiceProxy;
import com.hazelcast.partition.membergroup.MemberGroup;
import com.hazelcast.partition.membergroup.MemberGroupFactory;
import com.hazelcast.partition.membergroup.MemberGroupFactoryFactory;
import com.hazelcast.spi.Callback;
import com.hazelcast.spi.EventPublishingService;
import com.hazelcast.spi.EventRegistration;
import com.hazelcast.spi.EventService;
import com.hazelcast.spi.ExecutionService;
import com.hazelcast.spi.InvocationBuilder;
import com.hazelcast.spi.ManagedService;
import com.hazelcast.spi.NodeEngine;
import com.hazelcast.spi.Operation;
import com.hazelcast.spi.OperationService;
import com.hazelcast.spi.PartitionAwareService;
import com.hazelcast.spi.ResponseHandler;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.ResponseHandlerFactory;
import com.hazelcast.util.Clock;
import com.hazelcast.util.ExceptionUtil;
import com.hazelcast.util.FutureUtil.ExceptionHandler;
import com.hazelcast.util.scheduler.CoalescingDelayedTrigger;
import com.hazelcast.util.scheduler.EntryTaskScheduler;
import com.hazelcast.util.scheduler.EntryTaskSchedulerFactory;
import com.hazelcast.util.scheduler.ScheduleType;
import com.hazelcast.util.scheduler.ScheduledEntry;
import com.hazelcast.util.scheduler.ScheduledEntryProcessor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReferenceArray;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import static com.hazelcast.partition.impl.InternalPartitionServiceState.MIGRATION_LOCAL;
import static com.hazelcast.partition.impl.InternalPartitionServiceState.MIGRATION_ON_MASTER;
import static com.hazelcast.partition.impl.InternalPartitionServiceState.SAFE;
import static com.hazelcast.partition.impl.InternalPartitionServiceState.REPLICA_NOT_SYNC;
import static com.hazelcast.util.FutureUtil.logAllExceptions;
import static com.hazelcast.util.FutureUtil.waitWithDeadline;
/**
 * The {@link InternalPartitionService} implementation. Owns the partition table, schedules and
 * executes partition migrations and replica synchronizations, and publishes the partition
 * runtime state from the master to the other cluster members.
 */
public class InternalPartitionServiceImpl implements InternalPartitionService, ManagedService,
EventPublishingService<PartitionEvent, PartitionEventListener<PartitionEvent>>, PartitionAwareService {
private static final String EXCEPTION_MSG_PARTITION_STATE_SYNC_TIMEOUT = "Partition state sync invocation timed out";
private static final int DEFAULT_PAUSE_MILLIS = 1000;
private static final int PARTITION_OWNERSHIP_WAIT_MILLIS = 10;
private static final int REPLICA_SYNC_CHECK_TIMEOUT_SECONDS = 10;
private final Node node;
private final NodeEngineImpl nodeEngine;
private final ILogger logger;
private final int partitionCount;
private final InternalPartitionImpl[] partitions;
private final PartitionReplicaVersions[] replicaVersions;
private final AtomicReferenceArray<ReplicaSyncInfo> replicaSyncRequests;
private final EntryTaskScheduler<Integer, ReplicaSyncInfo> replicaSyncScheduler;
private final Semaphore replicaSyncProcessLock;
private final MigrationThread migrationThread;
private final long partitionMigrationInterval;
private final long partitionMigrationTimeout;
private final long backupSyncCheckInterval;
private final int maxParallelReplications;
private final PartitionStateGenerator partitionStateGenerator;
private final MemberGroupFactory memberGroupFactory;
private final PartitionServiceProxy proxy;
private final Lock lock = new ReentrantLock();
private final AtomicInteger stateVersion = new AtomicInteger();
private final BlockingQueue<Runnable> migrationQueue = new LinkedBlockingQueue<Runnable>();
private final AtomicBoolean migrationActive = new AtomicBoolean(true);
private final AtomicLong lastRepartitionTime = new AtomicLong();
private final CoalescingDelayedTrigger delayedResumeMigrationTrigger;
private final ExceptionHandler partitionStateSyncTimeoutHandler;
// can be read and written concurrently...
private volatile int memberGroupsSize;
// updates will be done under lock, but reads will be multithreaded.
private volatile boolean initialized;
// updates will be done under lock, but reads will be multithreaded.
private final ConcurrentMap<Integer, MigrationInfo> activeMigrations
= new ConcurrentHashMap<Integer, MigrationInfo>(3, 0.75f, 1);
// both reads and updates will be done under lock!
private final LinkedList<MigrationInfo> completedMigrations = new LinkedList<MigrationInfo>();
public InternalPartitionServiceImpl(Node node) {
this.partitionCount = node.groupProperties.PARTITION_COUNT.getInteger();
this.node = node;
this.nodeEngine = node.nodeEngine;
this.logger = node.getLogger(InternalPartitionService.class);
partitionStateSyncTimeoutHandler =
logAllExceptions(logger, EXCEPTION_MSG_PARTITION_STATE_SYNC_TIMEOUT, Level.FINEST);
this.partitions = new InternalPartitionImpl[partitionCount];
PartitionListener partitionListener = new LocalPartitionListener(this, node.getThisAddress());
for (int i = 0; i < partitionCount; i++) {
this.partitions[i] = new InternalPartitionImpl(i, partitionListener, node.getThisAddress());
}
replicaVersions = new PartitionReplicaVersions[partitionCount];
for (int i = 0; i < replicaVersions.length; i++) {
replicaVersions[i] = new PartitionReplicaVersions(i);
}
memberGroupFactory = MemberGroupFactoryFactory.newMemberGroupFactory(node.getConfig().getPartitionGroupConfig());
partitionStateGenerator = new PartitionStateGeneratorImpl();
long interval = node.groupProperties.PARTITION_MIGRATION_INTERVAL.getLong();
partitionMigrationInterval = interval > 0 ? TimeUnit.SECONDS.toMillis(interval) : 0;
partitionMigrationTimeout = TimeUnit.SECONDS.toMillis(
node.groupProperties.PARTITION_MIGRATION_TIMEOUT.getLong());
migrationThread = new MigrationThread(node);
proxy = new PartitionServiceProxy(this);
ExecutionService executionService = nodeEngine.getExecutionService();
ScheduledExecutorService scheduledExecutor = executionService.getDefaultScheduledExecutor();
// The reason this scheduler uses the POSTPONE type is as follows:
// When a node shifts up in the replica table upon a node failure, it sends a sync request to the partition owner and
// registers it to the replicaSyncRequests. If another node fails before the already-running sync process completes,
// the new sync request is simply scheduled to a further time. Again, before the already-running sync process completes,
// if another node fails for the third time, the already-scheduled sync request should be overwritten with the new one.
// This is because this node is shifted up to a higher level when the third node failure occurs and its respective sync
// request will inherently include the backup data that is requested by the previously scheduled sync request.
replicaSyncScheduler = EntryTaskSchedulerFactory.newScheduler(scheduledExecutor,
new ReplicaSyncEntryProcessor(this), ScheduleType.POSTPONE);
replicaSyncRequests = new AtomicReferenceArray<ReplicaSyncInfo>(partitionCount);
long maxMigrationDelayMs = calculateMaxMigrationDelayOnMemberRemoved();
long minMigrationDelayMs = calculateMigrationDelayOnMemberRemoved(maxMigrationDelayMs);
this.delayedResumeMigrationTrigger = new CoalescingDelayedTrigger(
executionService, minMigrationDelayMs, maxMigrationDelayMs, new Runnable() {
@Override
public void run() {
resumeMigration();
}
});
long definedBackupSyncCheckInterval = node.groupProperties.PARTITION_BACKUP_SYNC_INTERVAL.getInteger();
backupSyncCheckInterval = definedBackupSyncCheckInterval > 0 ? definedBackupSyncCheckInterval : 1;
maxParallelReplications = node.groupProperties.PARTITION_MAX_PARALLEL_REPLICATIONS.getInteger();
replicaSyncProcessLock = new Semaphore(maxParallelReplications);
}
private long calculateMaxMigrationDelayOnMemberRemoved() {
// Hard limit for the migration pause is half of the call timeout; otherwise we might experience timeouts.
return node.groupProperties.OPERATION_CALL_TIMEOUT_MILLIS.getLong() / 2;
}
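/**
 * Computes how long migrations stay paused after a member is removed: the maximum of the configured
 * minimum delay, the connection-error detection window and three heartbeat intervals, capped at
 * {@code maxDelayMs}.
 */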
private long calculateMigrationDelayOnMemberRemoved(long maxDelayMs) {
long migrationDelayMs = node.groupProperties.MIGRATION_MIN_DELAY_ON_MEMBER_REMOVED_SECONDS.getLong() * 1000;
long connectionErrorDetectionIntervalMs = node.groupProperties.CONNECTION_MONITOR_INTERVAL.getLong()
* node.groupProperties.CONNECTION_MONITOR_MAX_FAULTS.getInteger() * 5;
migrationDelayMs = Math.max(migrationDelayMs, connectionErrorDetectionIntervalMs);
long heartbeatIntervalMs = node.groupProperties.HEARTBEAT_INTERVAL_SECONDS.getLong() * 1000;
migrationDelayMs = Math.max(migrationDelayMs, heartbeatIntervalMs * 3);
migrationDelayMs = Math.min(migrationDelayMs, maxDelayMs);
return migrationDelayMs;
}
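/**
 * Starts the migration thread and schedules the periodic partition-table publication and
 * replica-version sync checks.
 */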
@Override
public void init(NodeEngine nodeEngine, Properties properties) {
migrationThread.start();
int partitionTableSendInterval = node.groupProperties.PARTITION_TABLE_SEND_INTERVAL.getInteger();
if (partitionTableSendInterval <= 0) {
partitionTableSendInterval = 1;
}
ExecutionService executionService = nodeEngine.getExecutionService();
executionService.scheduleAtFixedRate(new SendClusterStateTask(),
partitionTableSendInterval, partitionTableSendInterval, TimeUnit.SECONDS);
executionService.scheduleWithFixedDelay(new SyncReplicaVersionTask(),
backupSyncCheckInterval, backupSyncCheckInterval, TimeUnit.SECONDS);
}
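/**
 * Returns the current owner of the partition, triggering the first arrangement if the partition
 * table is not initialized yet. On a joined non-master member with no owner assigned, the master
 * is asked to assign partitions. May return {@code null} if no owner is known yet.
 */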
@Override
public Address getPartitionOwner(int partitionId) {
if (!initialized) {
firstArrangement();
}
if (partitions[partitionId].getOwnerOrNull() == null && !node.isMaster() && node.joined()) {
notifyMasterToAssignPartitions();
}
return partitions[partitionId].getOwnerOrNull();
}
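/**
 * Blocks until the partition has an owner, polling every {@code PARTITION_OWNERSHIP_WAIT_MILLIS}.
 */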
@Override
public Address getPartitionOwnerOrWait(int partition) {
Address owner = getPartitionOwner(partition);
while (owner == null) {
try {
Thread.sleep(PARTITION_OWNERSHIP_WAIT_MILLIS);
} catch (InterruptedException e) {
ExceptionUtil.rethrow(e);
}
owner = getPartitionOwner(partition);
}
return owner;
}
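/**
 * Best-effort request to the master to run the initial partition assignment. Uses {@code tryLock}
 * and waits at most one second for the master's response; failures are only logged.
 */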
private void notifyMasterToAssignPartitions() {
if (initialized) {
return;
}
if (lock.tryLock()) {
try {
if (!initialized && !node.isMaster() && node.getMasterAddress() != null && node.joined()) {
Future f = nodeEngine.getOperationService().createInvocationBuilder(SERVICE_NAME, new AssignPartitions(),
node.getMasterAddress()).setTryCount(1).invoke();
f.get(1, TimeUnit.SECONDS);
}
} catch (Exception e) {
logger.finest(e);
} finally {
lock.unlock();
}
}
}
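/**
 * On the master, builds the initial partition table from the current member groups via the
 * {@link PartitionStateGenerator} and publishes it; on other members, delegates to
 * {@code notifyMasterToAssignPartitions()}.
 */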
@Override
public void firstArrangement() {
if (!node.isMaster() || !node.isActive()) {
notifyMasterToAssignPartitions();
return;
}
if (!initialized) {
lock.lock();
try {
if (initialized) {
return;
}
PartitionStateGenerator psg = partitionStateGenerator;
final Set<Member> members = node.getClusterService().getMembers();
Collection<MemberGroup> memberGroups = memberGroupFactory.createMemberGroups(members);
if (memberGroups.isEmpty()) {
logger.warning("No member group is available to assign partition ownership...");
return;
}
logger.info("Initializing cluster partition table first arrangement...");
Address[][] newState = psg.initialize(memberGroups, partitionCount);
if (newState.length != partitionCount) {
throw new HazelcastException("Invalid partition count! "
+ "Expected: " + partitionCount + ", Actual: " + newState.length);
}
for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
InternalPartitionImpl partition = partitions[partitionId];
Address[] replicas = newState[partitionId];
partition.setReplicaAddresses(replicas);
}
initialized = true;
publishPartitionRuntimeState();
} finally {
lock.unlock();
}
}
}
private void updateMemberGroupsSize() {
Set<Member> members = node.getClusterService().getMembers();
final Collection<MemberGroup> groups = memberGroupFactory.createMemberGroups(members);
int size = 0;
for (MemberGroup group : groups) {
if (group.size() > 0) {
size++;
}
}
memberGroupsSize = size;
}
@Override
public int getMemberGroupsSize() {
int size = memberGroupsSize;
// size = 0 means service is not initialized yet.
// return 1 instead since there should be at least one member group
return size > 0 ? size : 1;
}
@Override
public int getMaxBackupCount() {
return Math.min(getMemberGroupsSize() - 1, InternalPartition.MAX_BACKUP_COUNT);
}
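/**
 * Called when a member joins. On an active master this clears the migration queue and, if the
 * partition table is initialized, schedules a repartitioning task and sends the current partition
 * table to the newly joined member.
 */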
public void memberAdded(MemberImpl member) {
if (!member.localMember()) {
updateMemberGroupsSize();
}
if (node.isMaster() && node.isActive()) {
lock.lock();
try {
migrationQueue.clear();
if (initialized) {
migrationQueue.add(new RepartitioningTask());
// send initial partition table to newly joined node.
Collection<MemberImpl> members = node.clusterService.getMemberList();
PartitionStateOperation op = new PartitionStateOperation(createPartitionState(members));
nodeEngine.getOperationService().send(op, member.getAddress());
}
} finally {
lock.unlock();
}
}
}
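/**
 * Called when a member leaves. Invalidates active migrations involving the dead address, pauses
 * migration while the cluster converges, cancels replica sync requests targeting the dead address,
 * shifts it out of the partition table and, on an initialized master, schedules a repartitioning.
 * Migration is resumed again after a delay.
 */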
public void memberRemoved(final MemberImpl member) {
updateMemberGroupsSize();
final Address deadAddress = member.getAddress();
final Address thisAddress = node.getThisAddress();
if (deadAddress == null || deadAddress.equals(thisAddress)) {
return;
}
lock.lock();
try {
migrationQueue.clear();
if (!activeMigrations.isEmpty()) {
if (node.isMaster()) {
rollbackActiveMigrationsFromPreviousMaster(node.getLocalMember().getUuid());
}
for (MigrationInfo migrationInfo : activeMigrations.values()) {
if (deadAddress.equals(migrationInfo.getSource()) || deadAddress.equals(migrationInfo.getDestination())) {
migrationInfo.invalidate();
}
}
}
// Pause migration and let all other members notice the dead member
// and fix their own partitions.
// Otherwise, the new master may act too quickly and send a new partition state
// before the other members have noticed the dead member.
pauseMigration();
cancelReplicaSyncRequestsTo(deadAddress);
removeDeadAddress(deadAddress, thisAddress);
if (node.isMaster() && initialized) {
migrationQueue.add(new RepartitioningTask());
}
resumeMigrationEventually();
} finally {
lock.unlock();
}
}
private void cancelReplicaSyncRequestsTo(Address deadAddress) {
for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
ReplicaSyncInfo syncInfo = replicaSyncRequests.get(partitionId);
if (syncInfo != null && deadAddress.equals(syncInfo.target)) {
cancelReplicaSync(partitionId);
}
}
}
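/**
 * Clears the pending replica sync request for the given partition (if any), cancels its scheduled
 * retry and releases a replica-sync permit.
 */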
void cancelReplicaSync(int partitionId) {
ReplicaSyncInfo syncInfo = replicaSyncRequests.get(partitionId);
if (syncInfo != null && replicaSyncRequests.compareAndSet(partitionId, syncInfo, null)) {
replicaSyncScheduler.cancel(partitionId);
releaseReplicaSyncPermit();
}
}
private void resumeMigrationEventually() {
delayedResumeMigrationTrigger.executeWithDelay();
}
private void removeDeadAddress(Address deadAddress, Address thisAddress) {
for (InternalPartitionImpl partition : partitions) {
if (deadAddress.equals(partition.getOwnerOrNull()) && thisAddress.equals(partition.getReplicaAddress(1))) {
partition.setMigrating(true);
}
// shift partition table up.
partition.onDeadAddress(deadAddress);
// safety check!
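// (a second successful removal means the dead address appeared more than once in the replica table)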
if (partition.onDeadAddress(deadAddress)) {
throw new IllegalStateException("Duplicate address found in partition replicas!");
}
}
}
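/**
 * Finalizes (effectively rolls back on this node) any active migrations that were initiated by a
 * previous master, i.e. whose master UUID differs from the current master's UUID.
 */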
private void rollbackActiveMigrationsFromPreviousMaster(final String currentMasterUuid) {
lock.lock();
try {
if (!activeMigrations.isEmpty()) {
for (MigrationInfo migrationInfo : activeMigrations.values()) {
if (!currentMasterUuid.equals(migrationInfo.getMasterUuid())) {
// There is still a possibility that the other endpoint commits the migration
// while this node rolls it back!
logger.info("Rolling-back migration initiated by the old master -> " + migrationInfo);
finalizeActiveMigration(migrationInfo);
}
}
}
} finally {
lock.unlock();
}
}
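/**
 * Builds a {@link PartitionRuntimeState} snapshot of the member list, partition table, completed
 * migrations and state version, under the partition service lock.
 */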
private PartitionRuntimeState createPartitionState(Collection<MemberImpl> members) {
lock.lock();
try {
List<MemberInfo> memberInfos = new ArrayList<MemberInfo>(members.size());
for (MemberImpl member : members) {
MemberInfo memberInfo = new MemberInfo(member.getAddress(), member.getUuid(), member.getAttributes());
memberInfos.add(memberInfo);
}
ArrayList<MigrationInfo> migrationInfos = new ArrayList<MigrationInfo>(completedMigrations);
ILogger logger = node.getLogger(PartitionRuntimeState.class);
return new PartitionRuntimeState(logger, memberInfos, partitions, migrationInfos, stateVersion.get());
} finally {
lock.unlock();
}
}
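/**
 * Fire-and-forget broadcast of the current partition table from the master to all other members.
 * Skipped before initialization, on non-masters and while migration is paused.
 */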
private void publishPartitionRuntimeState() {
if (!initialized) {
// do not send partition state until initialized!
return;
}
if (!node.isMaster() || !node.isActive() || !node.joined()) {
return;
}
if (!isMigrationActive()) {
// migration is disabled because of a member leave, wait till enabled!
return;
}
lock.lock();
try {
Collection<MemberImpl> members = node.clusterService.getMemberList();
PartitionRuntimeState partitionState = createPartitionState(members);
PartitionStateOperation op = new PartitionStateOperation(partitionState);
OperationService operationService = nodeEngine.getOperationService();
for (MemberImpl member : members) {
if (!member.localMember()) {
try {
operationService.send(op, member.getAddress());
} catch (Exception e) {
logger.finest(e);
}
}
}
} finally {
lock.unlock();
}
}
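/**
 * Synchronous variant of partition-state publication: sends the partition table to the given
 * members and waits up to three seconds for their responses.
 */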
private void syncPartitionRuntimeState() {
syncPartitionRuntimeState(node.clusterService.getMemberList());
}
private void syncPartitionRuntimeState(Collection<MemberImpl> members) {
if (!initialized) {
// do not send partition state until initialized!
return;
}
if (!node.isMaster() || !node.isActive() || !node.joined()) {
return;
}
lock.lock();
try {
PartitionRuntimeState partitionState = createPartitionState(members);
OperationService operationService = nodeEngine.getOperationService();
List<Future> calls = firePartitionStateOperation(members, partitionState, operationService);
waitWithDeadline(calls, 3, TimeUnit.SECONDS, partitionStateSyncTimeoutHandler);
} finally {
lock.unlock();
}
}
private List<Future> firePartitionStateOperation(Collection<MemberImpl> members,
PartitionRuntimeState partitionState,
OperationService operationService) {
List<Future> calls = new ArrayList<Future>(members.size());
for (MemberImpl member : members) {
if (!member.localMember()) {
try {
Address address = member.getAddress();
PartitionStateOperation operation = new PartitionStateOperation(partitionState, true);
Future