org.elasticsearch.cluster.coordination.ElectionSchedulerFactory Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
import java.util.Random;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import static org.elasticsearch.core.Strings.format;
/**
* It's provably impossible to guarantee that any leader election algorithm ever elects a leader, but they generally work (with probability
* that approaches 1 over time) as long as elections occur sufficiently infrequently, compared to the time it takes to send a message to
* another node and receive a response back. We do not know the round-trip latency here, but we can approximate it by attempting elections
* randomly at reasonably high frequency and backing off (linearly) until one of them succeeds. We also place an upper bound on the backoff
* so that if elections are failing due to a network partition that lasts for a long time then when the partition heals there is an election
* attempt reasonably quickly.
*/
public class ElectionSchedulerFactory {
private static final Logger logger = LogManager.getLogger(ElectionSchedulerFactory.class);
private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout";
private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time";
private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout";
private static final String ELECTION_DURATION_SETTING_KEY = "cluster.election.duration";
/*
* The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of
* milliseconds is chosen uniformly from
*
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
*
* For `n > 1`, the `n`th election is scheduled to occur a random number of milliseconds after the `n - 1`th election, where the random
* number of milliseconds is chosen uniformly from
*
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
*
* Each election lasts up to ELECTION_DURATION_SETTING.
*/
public static final Setting ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(
ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
TimeValue.timeValueMillis(100),
TimeValue.timeValueMillis(1),
TimeValue.timeValueSeconds(10),
Property.NodeScope
);
public static final Setting ELECTION_BACK_OFF_TIME_SETTING = Setting.timeSetting(
ELECTION_BACK_OFF_TIME_SETTING_KEY,
TimeValue.timeValueMillis(100),
TimeValue.timeValueMillis(1),
TimeValue.timeValueSeconds(60),
Property.NodeScope
);
public static final Setting ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(
ELECTION_MAX_TIMEOUT_SETTING_KEY,
TimeValue.timeValueSeconds(10),
TimeValue.timeValueMillis(200),
TimeValue.timeValueSeconds(600),
Property.NodeScope
);
public static final Setting ELECTION_DURATION_SETTING = Setting.timeSetting(
ELECTION_DURATION_SETTING_KEY,
TimeValue.timeValueMillis(500),
TimeValue.timeValueMillis(1),
TimeValue.timeValueSeconds(600),
Property.NodeScope
);
private final TimeValue initialTimeout;
private final TimeValue backoffTime;
private final TimeValue maxTimeout;
private final TimeValue duration;
private final ThreadPool threadPool;
private final Executor clusterCoordinationExecutor;
private final Random random;
public ElectionSchedulerFactory(Settings settings, Random random, ThreadPool threadPool) {
this.random = random;
this.threadPool = threadPool;
this.clusterCoordinationExecutor = threadPool.executor(Names.CLUSTER_COORDINATION);
initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings);
backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings);
maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings);
duration = ELECTION_DURATION_SETTING.get(settings);
if (maxTimeout.millis() < initialTimeout.millis()) {
throw new IllegalArgumentException(
format(
"[%s] is [%s], but must be at least [%s] which is [%s]",
ELECTION_MAX_TIMEOUT_SETTING_KEY,
maxTimeout,
ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
initialTimeout
)
);
}
}
/**
* Start the process to schedule repeated election attempts.
*
* @param gracePeriod An initial period to wait before attempting the first election.
* @param scheduledRunnable The action to run each time an election should be attempted.
*/
public Releasable startElectionScheduler(TimeValue gracePeriod, Runnable scheduledRunnable) {
final ElectionScheduler scheduler = new ElectionScheduler();
scheduler.scheduleNextElection(gracePeriod, scheduledRunnable);
return scheduler;
}
@SuppressForbidden(reason = "Argument to Math.abs() is definitely not Long.MIN_VALUE")
private static long nonNegative(long n) {
return n == Long.MIN_VALUE ? 0 : Math.abs(n);
}
/**
* @param randomNumber a randomly-chosen long
* @param upperBound inclusive upper bound
* @return a number in the range (0, upperBound]
*/
// package-private for testing
static long toPositiveLongAtMost(long randomNumber, long upperBound) {
assert 0 < upperBound : upperBound;
return nonNegative(randomNumber) % upperBound + 1;
}
@Override
public String toString() {
return "ElectionSchedulerFactory{"
+ "initialTimeout="
+ initialTimeout
+ ", backoffTime="
+ backoffTime
+ ", maxTimeout="
+ maxTimeout
+ '}';
}
private class ElectionScheduler implements Releasable {
private final AtomicBoolean isClosed = new AtomicBoolean();
private final AtomicLong attempt = new AtomicLong();
void scheduleNextElection(final TimeValue gracePeriod, final Runnable scheduledRunnable) {
if (isClosed.get()) {
logger.debug("{} not scheduling election", this);
return;
}
final long thisAttempt = attempt.getAndIncrement();
// to overflow here would take over a million years of failed election attempts, so we won't worry about that:
final long maxDelayMillis = Math.min(maxTimeout.millis(), initialTimeout.millis() + thisAttempt * backoffTime.millis());
final long delayMillis = toPositiveLongAtMost(random.nextLong(), maxDelayMillis) + gracePeriod.millis();
final Runnable runnable = new AbstractRunnable() {
@Override
public void onRejection(Exception e) {
logger.debug("threadpool was shut down", e);
}
@Override
public void onFailure(Exception e) {
logger.debug(() -> format("unexpected exception in wakeup of %s", this), e);
assert false : e;
}
@Override
protected void doRun() {
if (isClosed.get()) {
logger.debug("{} not starting election", this);
} else {
logger.debug("{} starting election", this);
if (thisAttempt > 0 && thisAttempt % 10 == 0) {
logger.info("""
retrying master election after [{}] failed attempts; \
election attempts are currently scheduled up to [{}ms] apart""", thisAttempt, maxDelayMillis);
}
scheduleNextElection(duration, scheduledRunnable);
scheduledRunnable.run();
}
}
@Override
public String toString() {
return "scheduleNextElection{gracePeriod="
+ gracePeriod
+ ", thisAttempt="
+ thisAttempt
+ ", maxDelayMillis="
+ maxDelayMillis
+ ", delayMillis="
+ delayMillis
+ ", "
+ ElectionScheduler.this
+ "}";
}
};
logger.debug("scheduling {}", runnable);
threadPool.scheduleUnlessShuttingDown(TimeValue.timeValueMillis(delayMillis), clusterCoordinationExecutor, runnable);
}
@Override
public String toString() {
return "ElectionScheduler{attempt=" + attempt + ",isClosed=" + isClosed.get() + "," + ElectionSchedulerFactory.this + "}";
}
@Override
public void close() {
logger.trace("closing {}", this);
boolean wasNotPreviouslyClosed = isClosed.compareAndSet(false, true);
assert wasNotPreviouslyClosed;
}
}
}