All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pulsar.metadata.coordination.impl.LeaderElectionImpl Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.pulsar.metadata.coordination.impl;

import com.fasterxml.jackson.databind.type.TypeFactory;
import com.google.common.annotations.VisibleForTesting;
import java.util.EnumSet;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.common.concurrent.FutureUtils;
import org.apache.pulsar.common.util.FutureUtil;
import org.apache.pulsar.metadata.api.GetResult;
import org.apache.pulsar.metadata.api.MetadataCache;
import org.apache.pulsar.metadata.api.MetadataCacheConfig;
import org.apache.pulsar.metadata.api.MetadataSerde;
import org.apache.pulsar.metadata.api.MetadataStoreException;
import org.apache.pulsar.metadata.api.MetadataStoreException.AlreadyClosedException;
import org.apache.pulsar.metadata.api.MetadataStoreException.BadVersionException;
import org.apache.pulsar.metadata.api.Notification;
import org.apache.pulsar.metadata.api.NotificationType;
import org.apache.pulsar.metadata.api.coordination.LeaderElection;
import org.apache.pulsar.metadata.api.coordination.LeaderElectionState;
import org.apache.pulsar.metadata.api.extended.CreateOption;
import org.apache.pulsar.metadata.api.extended.MetadataStoreExtended;
import org.apache.pulsar.metadata.api.extended.SessionEvent;
import org.apache.pulsar.metadata.cache.impl.JSONMetadataSerdeSimpleType;

@Slf4j
class LeaderElectionImpl implements LeaderElection {
    private final String path;
    private final MetadataSerde serde;
    private final MetadataStoreExtended store;
    private final MetadataCache cache;
    private final Consumer stateChangesListener;
    private final ScheduledFuture updateCachedValueFuture;

    private LeaderElectionState leaderElectionState;
    private Optional version = Optional.empty();
    private Optional proposedValue;

    private final ScheduledExecutorService executor;
    private final FutureUtil.Sequencer sequencer;

    private enum InternalState {
        Init, ElectionInProgress, LeaderIsPresent, Closed
    }

    private InternalState internalState;

    private static final int LEADER_ELECTION_RETRY_DELAY_SECONDS = 5;

    LeaderElectionImpl(MetadataStoreExtended store, Class clazz, String path,
            Consumer stateChangesListener,
                       ScheduledExecutorService executor) {
        this.path = path;
        this.serde = new JSONMetadataSerdeSimpleType<>(TypeFactory.defaultInstance().constructSimpleType(clazz, null));
        this.store = store;
        MetadataCacheConfig metadataCacheConfig = MetadataCacheConfig.builder()
                .expireAfterWriteMillis(-1L)
                .build();
        this.cache = store.getMetadataCache(clazz, metadataCacheConfig);
        this.leaderElectionState = LeaderElectionState.NoLeader;
        this.internalState = InternalState.Init;
        this.stateChangesListener = stateChangesListener;
        this.executor = executor;
        this.sequencer = FutureUtil.Sequencer.create();
        store.registerListener(this::handlePathNotification);
        store.registerSessionListener(this::handleSessionNotification);
        updateCachedValueFuture = executor.scheduleWithFixedDelay(this::getLeaderValue,
                metadataCacheConfig.getRefreshAfterWriteMillis() / 2,
                metadataCacheConfig.getRefreshAfterWriteMillis(), TimeUnit.MILLISECONDS);
    }

    @Override
    public synchronized CompletableFuture elect(T proposedValue) {
        if (leaderElectionState != LeaderElectionState.NoLeader) {
            return CompletableFuture.completedFuture(leaderElectionState);
        }

        this.proposedValue = Optional.of(proposedValue);
        return elect();
    }

    private synchronized CompletableFuture elect() {
        // First check if there's already a leader elected
        internalState = InternalState.ElectionInProgress;
        return store.get(path).thenCompose(optLock -> {
            if (optLock.isPresent()) {
                return handleExistingLeaderValue(optLock.get());
            } else {
                return tryToBecomeLeader();
            }
        }).thenCompose(leaderElectionState -> {
            // make sure that the cache contains the current leader
            // so that getLeaderValueIfPresent works on all brokers
            cache.refresh(path);
            return cache.get(path)
                    .thenApply(__ -> leaderElectionState);
        });
    }

    private synchronized CompletableFuture handleExistingLeaderValue(GetResult res) {
        T existingValue;
        try {
            existingValue = serde.deserialize(path, res.getValue(), res.getStat());
        } catch (Throwable t) {
            return FutureUtils.exception(t);
        }

        T value = proposedValue.orElse(null);
        if (existingValue.equals(value)) {
            // If the value is the same as our proposed value, it means this instance was the leader at some
            // point before. The existing value can either be for this same session or for a previous one.
            if (res.getStat().isCreatedBySelf()) {
                log.info("Keeping the existing value {} for {} as it's from the same session stat={}", existingValue,
                        path, res.getStat());
                // The value is still valid because it was created in the same session
                changeState(LeaderElectionState.Leading);
                return CompletableFuture.completedFuture(LeaderElectionState.Leading);
            } else {
                log.info("Conditionally deleting existing equals value {} for {} because it's not created in the "
                        + "current session. stat={}", existingValue, path, res.getStat());
                // Since the value was created in a different session, it might be expiring. We need to delete it
                // and try the election again.
                return store.delete(path, Optional.of(res.getStat().getVersion()))
                        .thenCompose(__ -> tryToBecomeLeader());
            }
        } else if (res.getStat().isCreatedBySelf()) {
            log.warn("Conditionally deleting existing value {} for {} because it's different from the proposed value "
                            + "({}). This is unexpected since it was created within the same session. "
                            + "In tests this could happen because of an invalid shared session id when using mocks.",
                    existingValue, path, value);
            // The existing value is different but was created from the same session
            return store.delete(path, Optional.of(res.getStat().getVersion()))
                    .thenCompose(__ -> tryToBecomeLeader());
        }

        // If the existing value is different, it means there's already another leader
        changeState(LeaderElectionState.Following);
        return CompletableFuture.completedFuture(LeaderElectionState.Following);
    }

    private synchronized void changeState(LeaderElectionState les) {
        internalState = InternalState.LeaderIsPresent;
        if (this.leaderElectionState != les) {
            this.leaderElectionState = les;
            try {
                stateChangesListener.accept(leaderElectionState);
            } catch (Throwable t) {
                log.warn("Exception in state change listener", t);
            }
        }
    }

    private synchronized CompletableFuture tryToBecomeLeader() {
        T value = proposedValue.get();
        byte[] payload;
        try {
            payload = serde.serialize(path, value);
        } catch (Throwable t) {
            return FutureUtils.exception(t);
        }

        CompletableFuture result = new CompletableFuture<>();
        store.put(path, payload, Optional.of(-1L), EnumSet.of(CreateOption.Ephemeral))
                .thenAccept(stat -> {
                    synchronized (LeaderElectionImpl.this) {
                        if (internalState == InternalState.ElectionInProgress) {
                            // Do a get() in order to force a notification later, if the z-node disappears
                            cache.get(path)
                                    .thenRun(() -> {
                                        synchronized (LeaderElectionImpl.this) {
                                            log.info("Acquired leadership on {} with {}", path, value);
                                            internalState = InternalState.LeaderIsPresent;
                                            if (leaderElectionState != LeaderElectionState.Leading) {
                                                leaderElectionState = LeaderElectionState.Leading;
                                                try {
                                                    stateChangesListener.accept(leaderElectionState);
                                                } catch (Throwable t) {
                                                    log.warn("Exception in state change listener", t);
                                                }
                                            }
                                            result.complete(leaderElectionState);
                                        }
                                    }).exceptionally(ex -> {
                                        // We fail to do the get(), so clean up the leader election fail the whole
                                        // operation
                                        log.warn("Failed to get the current state after acquiring leadership on {}. "
                                                + " Conditionally deleting current entry.", path, ex);
                                        store.delete(path, Optional.of(stat.getVersion()))
                                                .thenRun(() -> result.completeExceptionally(ex))
                                                .exceptionally(ex2 -> {
                                                    result.completeExceptionally(ex2);
                                                    return null;
                                                });
                                        return null;
                                    });
                        } else {
                            log.info("Leadership on {} with value {} was lost. "
                                            + "Conditionally deleting entry with stat={}.", path, value, stat);
                            // LeaderElection was closed in between. Release the lock asynchronously
                            store.delete(path, Optional.of(stat.getVersion()))
                                    .thenRun(() -> result.completeExceptionally(
                                            new AlreadyClosedException("The leader election was already closed")))
                                    .exceptionally(ex -> {
                                        result.completeExceptionally(ex);
                                        return null;
                                    });
                        }
                    }
                }).exceptionally(ex -> {
                    if (ex.getCause() instanceof BadVersionException) {
                        // There was a conflict between 2 participants trying to become leaders at same time. Retry
                        // to fetch info on new leader.
                        log.info("There was a conflict between 2 participants trying to become leaders at the same "
                                        + "time on {}. Attempted with value {}. Retrying.",
                                path, value);
                        elect()
                            .thenAccept(lse -> result.complete(lse))
                            .exceptionally(ex2 -> {
                                result.completeExceptionally(ex2);
                                return null;
                            });
                    } else {
                        result.completeExceptionally(ex.getCause());
                    }
                    return null;
                });

        return result;
    }

    @Override
    public void close() throws Exception {
        updateCachedValueFuture.cancel(true);
        try {
            asyncClose().join();
        } catch (CompletionException e) {
            throw MetadataStoreException.unwrap(e);
        }
    }

    @Override
    public synchronized CompletableFuture asyncClose() {
        if (internalState == InternalState.Closed) {
            return CompletableFuture.completedFuture(null);
        }

        internalState = InternalState.Closed;

        if (leaderElectionState != LeaderElectionState.Leading) {
            return CompletableFuture.completedFuture(null);
        }

        return store.delete(path, version)
                .thenAccept(__ -> {
                            synchronized (LeaderElectionImpl.this) {
                                leaderElectionState = LeaderElectionState.NoLeader;
                            }
                        }
                );
    }

    @Override
    public synchronized LeaderElectionState getState() {
        return leaderElectionState;
    }

    @Override
    public CompletableFuture> getLeaderValue() {
        return cache.get(path);
    }

    @Override
    public Optional getLeaderValueIfPresent() {
        return cache.getIfCached(path);
    }

    private void handleSessionNotification(SessionEvent event) {
        // Ensure we're only processing one session event at a time.
        sequencer.sequential(() -> FutureUtil.composeAsync(() -> {
            if (event == SessionEvent.Reconnected || event == SessionEvent.SessionReestablished) {
                log.info("Revalidating leadership for {}, event:{}", path, event);
                return elect().thenAccept(leaderState -> {
                    log.info("Resynced leadership for {} - State: {}", path, leaderState);
                }).exceptionally(ex -> {
                    log.warn("Failure when processing session event", ex);
                    return null;
                });
            }
            return CompletableFuture.completedFuture(null);
        }, executor));
    }

    private void handlePathNotification(Notification notification) {
        if (!path.equals(notification.getPath())) {
            // Ignore notifications we don't care about
            return;
        }

        synchronized (this) {
            if (internalState != InternalState.LeaderIsPresent) {
                // Ignoring notification since we're not trying to become leader
                return;
            }

            if (notification.getType() == NotificationType.Deleted) {
                if (leaderElectionState == LeaderElectionState.Leading) {
                    // We've lost the leadership, switch to follower mode
                    log.warn("Leadership released for {}", path);
                }

                leaderElectionState = LeaderElectionState.NoLeader;

                if (proposedValue.isPresent()) {
                    elect()
                            .exceptionally(ex -> {
                                log.warn("Leader election for path {} has failed", path, ex);
                                synchronized (LeaderElectionImpl.this) {
                                    try {
                                        stateChangesListener.accept(leaderElectionState);
                                    } catch (Throwable t) {
                                        log.warn("Exception in state change listener", t);
                                    }

                                    if (internalState != InternalState.Closed) {
                                        executor.schedule(() -> {
                                            log.info("Retrying Leader election for path {}", path);
                                            elect();
                                        }, LEADER_ELECTION_RETRY_DELAY_SECONDS, TimeUnit.SECONDS);
                                    }
                                }
                                return null;
                            });
                }
            }
        }
    }

    @VisibleForTesting
    protected ScheduledExecutorService getSchedulerExecutor() {
        return executor;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy