// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// org.elasticsearch.indices.recovery.RecoverySettings Maven / Gradle / Ivy
//
// There is a newer version: 8.15.1
// Show newest version
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.indices.recovery;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.store.RateLimiter.SimpleRateLimiter;
import org.elasticsearch.TransportVersion;
import org.elasticsearch.TransportVersions;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.concurrent.AdjustableSemaphore;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.monitor.os.OsProbe;
import org.elasticsearch.node.NodeRoleSettings;

import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING;
import static org.elasticsearch.common.settings.Setting.parseInt;
import static org.elasticsearch.common.unit.ByteSizeValue.ofBytes;
import static org.elasticsearch.core.Strings.format;
import static org.elasticsearch.node.NodeRoleSettings.NODE_ROLES_SETTING;

public class RecoverySettings {
    /**
     * Alias for {@code IndexVersions.V_7_15_0}.
     * NOTE(review): by its name, the oldest index version for which snapshot-based recovery is supported - confirm at call sites.
     */
    public static final IndexVersion SNAPSHOT_RECOVERIES_SUPPORTED_INDEX_VERSION = IndexVersions.V_7_15_0;
    /**
     * Alias for {@code TransportVersions.V_7_15_0}.
     * NOTE(review): by its name, the oldest transport version for which snapshot-based recovery is supported - confirm at call sites.
     */
    public static final TransportVersion SNAPSHOT_RECOVERIES_SUPPORTED_TRANSPORT_VERSION = TransportVersions.V_7_15_0;
    /**
     * Alias for {@code IndexVersions.V_7_16_0}.
     * NOTE(review): by its name, relates to sequence-number based snapshot recoveries - confirm at call sites.
     */
    public static final IndexVersion SEQ_NO_SNAPSHOT_RECOVERIES_SUPPORTED_VERSION = IndexVersions.V_7_16_0;
    /**
     * Alias for {@code TransportVersions.V_7_16_0}.
     * NOTE(review): by its name, the oldest transport version supporting throttled snapshot file downloads - confirm at call sites.
     */
    public static final TransportVersion SNAPSHOT_FILE_DOWNLOAD_THROTTLING_SUPPORTED_TRANSPORT_VERSION = TransportVersions.V_7_16_0;

    /** Class logger; used for the rate limit summary and for the warnings emitted when snapshot download permits cannot be acquired. */
    private static final Logger logger = LogManager.getLogger(RecoverySettings.class);

    /**
     * Undocumented setting, used to override the total physical available memory in tests.
     * <p>
     * When the setting is not configured, its default is the total physical memory size reported by {@link OsProbe}.
     **/
    // package private for tests
    static final Setting<ByteSizeValue> TOTAL_PHYSICAL_MEMORY_OVERRIDING_TEST_SETTING = Setting.byteSizeSetting(
        "recovery_settings.total_physical_memory_override",
        settings -> ByteSizeValue.ofBytes(OsProbe.getInstance().getTotalPhysicalMemorySize()).getStringRep(),
        Property.NodeScope
    );

    /**
     * Disk's write bandwidth allocated for this node. This bandwidth is expressed for write operations that have the default block size of
     * {@link #DEFAULT_CHUNK_SIZE}. Defaults to {@code -1}, meaning undefined.
     */
    public static final Setting<ByteSizeValue> NODE_BANDWIDTH_RECOVERY_DISK_WRITE_SETTING = bandwidthSetting(
        "node.bandwidth.recovery.disk.write"
    );

    /**
     * Disk's read bandwidth allocated for this node. This bandwidth is expressed for read operations that have the default block size of
     * {@link #DEFAULT_CHUNK_SIZE}. Defaults to {@code -1}, meaning undefined.
     */
    public static final Setting<ByteSizeValue> NODE_BANDWIDTH_RECOVERY_DISK_READ_SETTING = bandwidthSetting(
        "node.bandwidth.recovery.disk.read"
    );

    /**
     * Network's read bandwidth allocated for this node. Defaults to {@code -1}, meaning undefined.
     */
    public static final Setting<ByteSizeValue> NODE_BANDWIDTH_RECOVERY_NETWORK_SETTING = bandwidthSetting(
        "node.bandwidth.recovery.network"
    );

    /** Default value of {@link #NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING}. */
    static final double DEFAULT_FACTOR_VALUE = 0.4d;

    /**
     * Default factor as defined by the operator. The read and write operator factors fall back to this setting when they are not set.
     */
    public static final Setting<Double> NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING = operatorFactorSetting(
        "node.bandwidth.recovery.operator.factor",
        DEFAULT_FACTOR_VALUE
    );

    /**
     * Operator-defined factor for write bandwidth; falls back to {@link #NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING}.
     */
    public static final Setting<Double> NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_WRITE_SETTING = operatorFactorSetting(
        "node.bandwidth.recovery.operator.factor.write"
    );

    /**
     * Operator-defined factor for read bandwidth; falls back to {@link #NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING}.
     */
    public static final Setting<Double> NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_READ_SETTING = operatorFactorSetting(
        "node.bandwidth.recovery.operator.factor.read"
    );

    /**
     * Multiplier (at least 1, default 100) applied to the smallest of the node's disk read, disk write and network bandwidths to
     * obtain the upper bound of the recovery rate limit.
     */
    public static final Setting<Double> NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_MAX_OVERCOMMIT_SETTING = Setting.doubleSetting(
        "node.bandwidth.recovery.operator.factor.max_overcommit",
        100d, // high default overcommit
        1d,
        Double.MAX_VALUE,
        Property.NodeScope,
        Property.OperatorDynamic
    );

    /**
     * User-defined factor for write bandwidth; falls back to {@link #NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_WRITE_SETTING}.
     */
    public static final Setting<Double> NODE_BANDWIDTH_RECOVERY_FACTOR_WRITE_SETTING = factorSetting(
        "node.bandwidth.recovery.factor.write",
        NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_WRITE_SETTING
    );

    /**
     * User-defined factor for read bandwidth; falls back to {@link #NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_READ_SETTING}.
     */
    public static final Setting<Double> NODE_BANDWIDTH_RECOVERY_FACTOR_READ_SETTING = factorSetting(
        "node.bandwidth.recovery.factor.read",
        NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_READ_SETTING
    );

    static final List> NODE_BANDWIDTH_RECOVERY_SETTINGS = List.of(
        NODE_BANDWIDTH_RECOVERY_NETWORK_SETTING,
        NODE_BANDWIDTH_RECOVERY_DISK_READ_SETTING,
        NODE_BANDWIDTH_RECOVERY_DISK_WRITE_SETTING
    );

    /**
     * Bandwidth settings have a default value of -1 (meaning that they are undefined) or a value in (0, Long.MAX_VALUE).
     *
     * @param key the setting key, also used in parse error messages
     * @return a node-scoped byte size setting that rejects zero and {@code Long.MAX_VALUE} bytes
     */
    private static Setting<ByteSizeValue> bandwidthSetting(String key) {
        return new Setting<>(key, ByteSizeValue.MINUS_ONE.getStringRep(), s -> {
            final ByteSizeValue value = ByteSizeValue.parseBytesSizeValue(s, key);
            if (ByteSizeValue.MINUS_ONE.equals(value)) {
                // return the shared constant rather than the parsed instance so that "undefined" is always the same object
                return ByteSizeValue.MINUS_ONE;
            }
            if (value.getBytes() <= 0L) {
                throw new IllegalArgumentException(
                    "Failed to parse value ["
                        + s
                        + "] for bandwidth setting ["
                        + key
                        + "], must be > ["
                        + ByteSizeValue.ZERO.getStringRep()
                        + ']'
                );
            }
            if (value.getBytes() >= Long.MAX_VALUE) {
                throw new IllegalArgumentException(
                    "Failed to parse value ["
                        + s
                        + "] for bandwidth setting ["
                        + key
                        + "], must be < ["
                        + ByteSizeValue.ofBytes(Long.MAX_VALUE).getStringRep()
                        + ']'
                );
            }
            return value;
        }, Property.NodeScope);
    }

    /**
     * Operator-defined factors have a value in (0.0, 1.0]
     *
     * @param key          the setting key
     * @param defaultValue the value used when the setting is not configured
     */
    private static Setting<Double> operatorFactorSetting(String key, double defaultValue) {
        return new Setting<>(
            key,
            Double.toString(defaultValue),
            s -> Setting.parseDouble(s, 0d, 1d, key),
            v -> requireNonZeroFactor(key, v),
            Property.NodeScope,
            Property.OperatorDynamic
        );
    }

    /**
     * Operator-defined factor with a value in (0.0, 1.0] that falls back to {@link #NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING}
     * when it is not configured.
     *
     * @param key the setting key
     */
    private static Setting<Double> operatorFactorSetting(String key) {
        return new Setting<>(
            key,
            NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING,
            s -> Setting.parseDouble(s, 0d, 1d, key),
            v -> requireNonZeroFactor(key, v),
            Property.NodeScope,
            Property.OperatorDynamic
        );
    }

    /**
     * User-defined factors have a value in (0.0, 1.0] and fall back to a corresponding operator factor setting.
     *
     * @param key              the setting key
     * @param operatorFallback the operator factor setting consulted when this setting is not configured
     */
    private static Setting<Double> factorSetting(String key, Setting<Double> operatorFallback) {
        return new Setting<>(
            key,
            operatorFallback,
            s -> Setting.parseDouble(s, 0d, 1d, key),
            v -> requireNonZeroFactor(key, v),
            Property.NodeScope,
            Property.Dynamic
        );
    }

    /**
     * Rejects a factor of zero; the parser already restricts factors to [0.0, 1.0], so this makes the lower bound exclusive.
     *
     * @throws IllegalArgumentException if {@code value} is zero
     */
    private static void requireNonZeroFactor(String key, double value) {
        if (value == 0d) {
            throw new IllegalArgumentException("Failed to validate value [" + value + "] for factor setting [" + key + "] must be > [0]");
        }
    }

    /** Default recovery rate limit (40MB) used for every node that is not a dedicated cold and/or frozen data node. */
    static final ByteSizeValue DEFAULT_MAX_BYTES_PER_SEC = new ByteSizeValue(40L, ByteSizeUnit.MB);

    /**
     * Recovery rate limit. When not configured, the default is {@link #DEFAULT_MAX_BYTES_PER_SEC}, except on nodes whose data roles
     * are exclusively cold and/or frozen, where the default grows with the total physical memory:
     * 40MB up to 4GB, 60MB up to 8GB, 90MB up to 16GB, 125MB up to 32GB and 250MB above that.
     */
    public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING = Setting.byteSizeSetting(
        "indices.recovery.max_bytes_per_sec",
        s -> {
            final List<DiscoveryNodeRole> roles = NodeRoleSettings.NODE_ROLES_SETTING.get(s);
            final List<DiscoveryNodeRole> dataRoles = roles.stream().filter(DiscoveryNodeRole::canContainData).toList();
            if (dataRoles.isEmpty()) {
                // if the node is not a data node, this value doesn't matter, use the default
                return DEFAULT_MAX_BYTES_PER_SEC.getStringRep();
            }
            if (dataRoles.stream()
                .allMatch(
                    dn -> dn.equals(DiscoveryNodeRole.DATA_COLD_NODE_ROLE) || dn.equals(DiscoveryNodeRole.DATA_FROZEN_NODE_ROLE)
                ) == false) {
                // the node is not a dedicated cold and/or frozen node, use the default
                return DEFAULT_MAX_BYTES_PER_SEC.getStringRep();
            }
            /*
             * Now we are looking at a node whose data roles are exclusively the cold and/or frozen data roles. In this case, we are
             * going to set the recovery size as a function of the memory size. We are making an assumption here that the size of the
             * instance is correlated with I/O resources. That is we are assuming that the larger the instance, the more disk and
             * networking capacity it has available.
             */
            final ByteSizeValue totalPhysicalMemory = TOTAL_PHYSICAL_MEMORY_OVERRIDING_TEST_SETTING.get(s);
            final ByteSizeValue maxBytesPerSec;
            if (totalPhysicalMemory.compareTo(new ByteSizeValue(4, ByteSizeUnit.GB)) <= 0) {
                maxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
            } else if (totalPhysicalMemory.compareTo(new ByteSizeValue(8, ByteSizeUnit.GB)) <= 0) {
                maxBytesPerSec = new ByteSizeValue(60, ByteSizeUnit.MB);
            } else if (totalPhysicalMemory.compareTo(new ByteSizeValue(16, ByteSizeUnit.GB)) <= 0) {
                maxBytesPerSec = new ByteSizeValue(90, ByteSizeUnit.MB);
            } else if (totalPhysicalMemory.compareTo(new ByteSizeValue(32, ByteSizeUnit.GB)) <= 0) {
                maxBytesPerSec = new ByteSizeValue(125, ByteSizeUnit.MB);
            } else {
                maxBytesPerSec = new ByteSizeValue(250, ByteSizeUnit.MB);
            }
            return maxBytesPerSec.getStringRep();
        },
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node.
     * Defaults to 2; accepted range is 1 to 8.
     */
    public static final Setting<Integer> INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING = Setting.intSetting(
        "indices.recovery.max_concurrent_file_chunks",
        2,
        1,
        8,
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * Controls the maximum number of operation chunk requests that can be sent concurrently from the source node to the target node.
     * Defaults to 1; accepted range is 1 to 4.
     */
    public static final Setting<Integer> INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING = Setting.intSetting(
        "indices.recovery.max_concurrent_operations",
        1,
        1,
        4,
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * how long to wait before retrying after issues caused by cluster state syncing between nodes
     * i.e., local node is not yet known on remote node, remote shard not yet started etc.
     */
    public static final Setting<TimeValue> INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING = Setting.positiveTimeSetting(
        "indices.recovery.retry_delay_state_sync",
        TimeValue.timeValueMillis(500),
        Property.Dynamic,
        Property.NodeScope
    );

    /** how long to wait before retrying after network related issues */
    public static final Setting<TimeValue> INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING = Setting.positiveTimeSetting(
        "indices.recovery.retry_delay_network",
        TimeValue.timeValueSeconds(5),
        Property.Dynamic,
        Property.NodeScope
    );

    /** timeout value to use for requests made as part of the recovery process */
    public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING = Setting.positiveTimeSetting(
        "indices.recovery.internal_action_timeout",
        TimeValue.timeValueMinutes(15),
        Property.Dynamic,
        Property.NodeScope
    );

    /** timeout value to use for the retrying of requests made as part of the recovery process */
    public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_ACTION_RETRY_TIMEOUT_SETTING = Setting.positiveTimeSetting(
        "indices.recovery.internal_action_retry_timeout",
        TimeValue.timeValueMinutes(1),
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * timeout value to use for requests made as part of the recovery process that are expected to take a long time.
     * defaults to twice `indices.recovery.internal_action_timeout`.
     */
    public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING = Setting.timeSetting(
        "indices.recovery.internal_action_long_timeout",
        (s) -> TimeValue.timeValueMillis(INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(s).millis() * 2),
        TimeValue.timeValueSeconds(0),
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * recoveries that don't show any activity for more than this interval will be failed.
     * defaults to `indices.recovery.internal_action_long_timeout`
     */
    public static final Setting<TimeValue> INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING = Setting.timeSetting(
        "indices.recovery.recovery_activity_timeout",
        INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING::get,
        TimeValue.timeValueSeconds(0),
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * recoveries would try to use files from available snapshots instead of sending them from the source node.
     * defaults to `true`
     */
    public static final Setting<Boolean> INDICES_RECOVERY_USE_SNAPSHOTS_SETTING = Setting.boolSetting(
        "indices.recovery.use_snapshots",
        true,
        Property.Dynamic,
        Property.NodeScope
    );

    /**
     * Number of snapshot file download permits that a single recovery tries to acquire from the per-node pool.
     * Defaults to 5; accepted range is 1 to 20.
     */
    public static final Setting<Integer> INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS = Setting.intSetting(
        "indices.recovery.max_concurrent_snapshot_file_downloads",
        5,
        1,
        20,
        Property.Dynamic,
        Property.NodeScope
    );

    public static final Setting INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE = new Setting<>(
        "indices.recovery.max_concurrent_snapshot_file_downloads_per_node",
        "25",
        (s) -> parseInt(s, 1, 25, "indices.recovery.max_concurrent_snapshot_file_downloads_per_node", false),
        new Setting.Validator<>() {
            private final Collection> dependencies = Collections.singletonList(
                INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS
            );

            @Override
            public void validate(Integer value) {
                // ignore
            }

            @Override
            public void validate(Integer maxConcurrentSnapshotFileDownloadsPerNode, Map, Object> settings) {
                int maxConcurrentSnapshotFileDownloads = (int) settings.get(INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS);
                if (maxConcurrentSnapshotFileDownloadsPerNode < maxConcurrentSnapshotFileDownloads) {
                    throw new IllegalArgumentException(
                        String.format(
                            Locale.ROOT,
                            "[%s]=%d is less than [%s]=%d",
                            INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE.getKey(),
                            maxConcurrentSnapshotFileDownloadsPerNode,
                            INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS.getKey(),
                            maxConcurrentSnapshotFileDownloads
                        )
                    );
                }
            }

            @Override
            public Iterator> settings() {
                return dependencies.iterator();
            }
        },
        Setting.Property.Dynamic,
        Setting.Property.NodeScope
    );

    /** Default chunk size (512KB); also the initial value of {@link #chunkSize}. */
    public static final ByteSizeValue DEFAULT_CHUNK_SIZE = new ByteSizeValue(512, ByteSizeUnit.KB);

    // Effective rate limit, as last computed by computeMaxBytesPerSec.
    private volatile ByteSizeValue maxBytesPerSec;
    private volatile int maxConcurrentFileChunks;
    private volatile int maxConcurrentOperations;
    // null when the effective rate limit is <= 0 bytes (see setMaxBytesPerSec)
    private volatile SimpleRateLimiter rateLimiter;
    private volatile TimeValue retryDelayStateSync;
    private volatile TimeValue retryDelayNetwork;
    private volatile TimeValue activityTimeout;
    private volatile TimeValue internalActionTimeout;
    private volatile TimeValue internalActionRetryTimeout;
    private volatile TimeValue internalActionLongTimeout;
    private volatile boolean useSnapshotsDuringRecovery;
    // computed once in the constructor from the node bandwidth settings
    private final boolean nodeBandwidthSettingsExist;
    private volatile int maxConcurrentSnapshotFileDownloads;
    private volatile int maxConcurrentSnapshotFileDownloadsPerNode;
    private volatile int maxConcurrentIncomingRecoveries;

    // Pool of snapshot file download permits shared by all recoveries on this node; its size follows
    // maxConcurrentSnapshotFileDownloadsPerNode.
    private final AdjustableSemaphore maxSnapshotFileDownloadsPerNodeSemaphore;

    private volatile ByteSizeValue chunkSize = DEFAULT_CHUNK_SIZE;

    // Node bandwidths read once at construction time; -1 means undefined.
    private final ByteSizeValue availableNetworkBandwidth;
    private final ByteSizeValue availableDiskReadBandwidth;
    private final ByteSizeValue availableDiskWriteBandwidth;

    /**
     * Reads the initial value of every recovery setting from {@code settings}, validates the node bandwidth settings, computes the
     * initial rate limit and registers update consumers on {@code clusterSettings} so that dynamic changes are picked up.
     *
     * @param settings        the node settings providing the initial values
     * @param clusterSettings the registry on which the update consumers are registered
     * @throws IllegalArgumentException if only some of the node bandwidth settings are configured
     */
    @SuppressWarnings("this-escape")
    public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
        retryDelayStateSync = INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.get(settings);
        maxConcurrentFileChunks = INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.get(settings);
        maxConcurrentOperations = INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING.get(settings);
        // doesn't have to be fast as nodes are reconnected every 10s by default (see InternalClusterService.ReconnectToNodes)
        // and we want to give the master time to remove a faulty node
        retryDelayNetwork = INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.get(settings);

        // timeouts
        internalActionTimeout = INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(settings);
        internalActionRetryTimeout = INDICES_RECOVERY_INTERNAL_ACTION_RETRY_TIMEOUT_SETTING.get(settings);
        internalActionLongTimeout = INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING.get(settings);
        activityTimeout = INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING.get(settings);

        // snapshot-based recovery
        useSnapshotsDuringRecovery = INDICES_RECOVERY_USE_SNAPSHOTS_SETTING.get(settings);
        maxConcurrentSnapshotFileDownloads = INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS.get(settings);
        maxConcurrentSnapshotFileDownloadsPerNode = INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE.get(settings);
        maxConcurrentIncomingRecoveries = CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.get(settings);
        maxSnapshotFileDownloadsPerNodeSemaphore = new AdjustableSemaphore(maxConcurrentSnapshotFileDownloadsPerNode, true);

        // node bandwidths; these must be in place before the first rate limit computation
        availableNetworkBandwidth = NODE_BANDWIDTH_RECOVERY_NETWORK_SETTING.get(settings);
        availableDiskReadBandwidth = NODE_BANDWIDTH_RECOVERY_DISK_READ_SETTING.get(settings);
        availableDiskWriteBandwidth = NODE_BANDWIDTH_RECOVERY_DISK_WRITE_SETTING.get(settings);
        validateNodeBandwidthRecoverySettings(settings);
        nodeBandwidthSettingsExist = hasNodeBandwidthRecoverySettings(settings);
        computeMaxBytesPerSec(settings);

        // only data nodes recompute the rate limit when one of its inputs changes
        if (DiscoveryNode.canContainData(settings)) {
            clusterSettings.addSettingsUpdateConsumer(
                this::computeMaxBytesPerSec,
                List.of(
                    INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING,
                    NODE_BANDWIDTH_RECOVERY_FACTOR_READ_SETTING,
                    NODE_BANDWIDTH_RECOVERY_FACTOR_WRITE_SETTING,
                    NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_SETTING,
                    NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_READ_SETTING,
                    NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_WRITE_SETTING,
                    NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_MAX_OVERCOMMIT_SETTING,
                    // non dynamic settings but they are used to update max bytes per sec
                    NODE_BANDWIDTH_RECOVERY_DISK_WRITE_SETTING,
                    NODE_BANDWIDTH_RECOVERY_DISK_READ_SETTING,
                    NODE_BANDWIDTH_RECOVERY_NETWORK_SETTING,
                    NODE_ROLES_SETTING
                )
            );
        }

        // one consumer per dynamic setting, each delegating to the matching setter
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING, this::setMaxConcurrentFileChunks);
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING, this::setMaxConcurrentOperations);
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING, this::setRetryDelayStateSync);
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING, this::setRetryDelayNetwork);
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING, this::setInternalActionTimeout);
        clusterSettings.addSettingsUpdateConsumer(
            INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING,
            this::setInternalActionLongTimeout
        );
        clusterSettings.addSettingsUpdateConsumer(
            INDICES_RECOVERY_INTERNAL_ACTION_RETRY_TIMEOUT_SETTING,
            this::setInternalActionRetryTimeout
        );
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING, this::setActivityTimeout);
        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_USE_SNAPSHOTS_SETTING, this::setUseSnapshotsDuringRecovery);
        clusterSettings.addSettingsUpdateConsumer(
            INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS,
            this::setMaxConcurrentSnapshotFileDownloads
        );
        clusterSettings.addSettingsUpdateConsumer(
            INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE,
            this::setMaxConcurrentSnapshotFileDownloadsPerNode
        );
        clusterSettings.addSettingsUpdateConsumer(
            CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING,
            this::setMaxConcurrentIncomingRecoveries
        );
    }

    /**
     * Computes the effective recovery rate limit from {@code settings} and the node bandwidths captured at construction time, logs
     * the outcome and applies it through {@link #setMaxBytesPerSec(ByteSizeValue)}.
     * <p>
     * The limit is the configured {@code indices.recovery.max_bytes_per_sec}, or the larger of that value and the
     * bandwidth-derived value when node bandwidths are defined, the setting is not explicitly set and the node can contain data.
     * When node bandwidths are defined the result is additionally capped by the smallest bandwidth times the max overcommit factor.
     *
     * @param settings the settings to read the rate limit, the factors and the node roles from
     */
    private void computeMaxBytesPerSec(Settings settings) {
        // limit as computed before 8.1.0
        final long defaultBytesPerSec = Math.max(INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING.get(settings).getBytes(), 0L);

        // raw node bandwidths; -1 (undefined) for the disks, clamped to 0 for the network
        final long diskReadBytes = availableDiskReadBandwidth.getBytes();
        final long diskWriteBytes = availableDiskWriteBandwidth.getBytes();
        final long networkBandwidthBytesPerSec = Math.max(availableNetworkBandwidth.getBytes(), 0L);

        // read bandwidth
        final long readBytesPerSec;
        if (diskReadBytes <= 0L || networkBandwidthBytesPerSec <= 0L) {
            readBytesPerSec = 0L;
        } else {
            final double readFactor = NODE_BANDWIDTH_RECOVERY_FACTOR_READ_SETTING.get(settings);
            readBytesPerSec = Math.round(Math.min(diskReadBytes, networkBandwidthBytesPerSec) * readFactor);
        }

        // write bandwidth
        final long writeBytesPerSec;
        if (diskWriteBytes <= 0L || networkBandwidthBytesPerSec <= 0L) {
            writeBytesPerSec = 0L;
        } else {
            final double writeFactor = NODE_BANDWIDTH_RECOVERY_FACTOR_WRITE_SETTING.get(settings);
            writeBytesPerSec = Math.round(Math.min(diskWriteBytes, networkBandwidthBytesPerSec) * writeFactor);
        }

        final long availableBytesPerSec = Math.min(readBytesPerSec, writeBytesPerSec);
        assert nodeBandwidthSettingsExist == (availableBytesPerSec != 0L);

        final boolean keepDefault = availableBytesPerSec == 0L                  // no node recovery bandwidths
            || INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING.exists(settings)      // when set this setting overrides node recovery bandwidths
            || DiscoveryNode.canContainData(settings) == false;                 // keep previous behavior for non data nodes
        final long requestedBytesPerSec = keepDefault ? defaultBytesPerSec : Math.max(defaultBytesPerSec, availableBytesPerSec);

        // upper bound: the smallest bandwidth (never negative) scaled by the operator's max overcommit factor
        final long smallestBandwidth = Math.min(Math.min(diskReadBytes, diskWriteBytes), networkBandwidthBytesPerSec);
        final long maxAllowedBytesPerSec = Math.round(
            Math.max(smallestBandwidth, 0L) * NODE_BANDWIDTH_RECOVERY_OPERATOR_FACTOR_MAX_OVERCOMMIT_SETTING.get(settings)
        );

        final long effectiveBytesPerSec;
        if (maxAllowedBytesPerSec <= 0L) {
            // no upper bound available, use the requested value as is
            effectiveBytesPerSec = requestedBytesPerSec;
        } else if (requestedBytesPerSec > 0L) {
            effectiveBytesPerSec = Math.min(requestedBytesPerSec, maxAllowedBytesPerSec);
        } else {
            effectiveBytesPerSec = maxAllowedBytesPerSec;
        }
        final ByteSizeValue finalMaxBytesPerSec = ByteSizeValue.ofBytes(effectiveBytesPerSec);

        logger.info(
            () -> format(
                "using rate limit [%s] with [default=%s, read=%s, write=%s, max=%s]",
                finalMaxBytesPerSec,
                ofBytes(defaultBytesPerSec),
                ofBytes(readBytesPerSec),
                ofBytes(writeBytesPerSec),
                ofBytes(maxAllowedBytesPerSec)
            )
        );
        setMaxBytesPerSec(finalMaxBytesPerSec);
    }

    /**
     * @return the current recovery rate limiter, or {@code null} when the effective rate limit is not positive
     */
    public RateLimiter rateLimiter() {
        return rateLimiter;
    }

    /** @return the current value of {@code indices.recovery.retry_delay_network} */
    public TimeValue retryDelayNetwork() {
        return retryDelayNetwork;
    }

    /** @return the current value of {@code indices.recovery.retry_delay_state_sync} */
    public TimeValue retryDelayStateSync() {
        return retryDelayStateSync;
    }

    /** @return the current value of {@code indices.recovery.recovery_activity_timeout} */
    public TimeValue activityTimeout() {
        return activityTimeout;
    }

    /** @return the current value of {@code indices.recovery.internal_action_timeout} */
    public TimeValue internalActionTimeout() {
        return internalActionTimeout;
    }

    /** @return the current value of {@code indices.recovery.internal_action_retry_timeout} */
    public TimeValue internalActionRetryTimeout() {
        return internalActionRetryTimeout;
    }

    /** @return the current value of {@code indices.recovery.internal_action_long_timeout} */
    public TimeValue internalActionLongTimeout() {
        return internalActionLongTimeout;
    }

    /** @return the current chunk size; {@link #DEFAULT_CHUNK_SIZE} unless changed through {@link #setChunkSize(ByteSizeValue)} */
    public ByteSizeValue getChunkSize() {
        return chunkSize;
    }

    /**
     * Replaces the chunk size.
     *
     * @param chunkSize the new chunk size; its size in bytes, taken as an {@code int}, must be strictly positive
     * @throws IllegalArgumentException if the chunk size is not strictly positive
     */
    public void setChunkSize(ByteSizeValue chunkSize) { // only settable for tests
        final int requestedBytes = chunkSize.bytesAsInt();
        if (requestedBytes > 0) {
            this.chunkSize = chunkSize;
            return;
        }
        throw new IllegalArgumentException("chunkSize must be > 0");
    }

    /** Update consumer for {@code indices.recovery.retry_delay_state_sync}. */
    public void setRetryDelayStateSync(TimeValue retryDelayStateSync) {
        this.retryDelayStateSync = retryDelayStateSync;
    }

    /** Update consumer for {@code indices.recovery.retry_delay_network}. */
    public void setRetryDelayNetwork(TimeValue retryDelayNetwork) {
        this.retryDelayNetwork = retryDelayNetwork;
    }

    /** Update consumer for {@code indices.recovery.recovery_activity_timeout}. */
    public void setActivityTimeout(TimeValue activityTimeout) {
        this.activityTimeout = activityTimeout;
    }

    /** Update consumer for {@code indices.recovery.internal_action_timeout}. */
    public void setInternalActionTimeout(TimeValue internalActionTimeout) {
        this.internalActionTimeout = internalActionTimeout;
    }

    /** Update consumer for {@code indices.recovery.internal_action_long_timeout}. */
    public void setInternalActionLongTimeout(TimeValue internalActionLongTimeout) {
        this.internalActionLongTimeout = internalActionLongTimeout;
    }

    /** Update consumer for {@code indices.recovery.internal_action_retry_timeout}. */
    public void setInternalActionRetryTimeout(TimeValue internalActionRetryTimeout) {
        this.internalActionRetryTimeout = internalActionRetryTimeout;
    }

    /**
     * Stores the effective rate limit and brings the rate limiter in line with it: a non-positive limit removes the limiter,
     * otherwise the existing limiter is re-tuned or a new one is created.
     *
     * @param maxBytesPerSec the effective rate limit
     */
    private void setMaxBytesPerSec(ByteSizeValue maxBytesPerSec) {
        this.maxBytesPerSec = maxBytesPerSec;
        if (maxBytesPerSec.getBytes() <= 0) {
            // a non-positive limit disables rate limiting
            rateLimiter = null;
            return;
        }
        if (rateLimiter == null) {
            rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
        } else {
            rateLimiter.setMBPerSec(maxBytesPerSec.getMbFrac());
        }
    }

    /** @return the effective recovery rate limit as last computed */
    public ByteSizeValue getMaxBytesPerSec() {
        return maxBytesPerSec;
    }

    /** @return the current value of {@code indices.recovery.max_concurrent_file_chunks} */
    public int getMaxConcurrentFileChunks() {
        return maxConcurrentFileChunks;
    }

    /** Update consumer for {@code indices.recovery.max_concurrent_file_chunks}. */
    private void setMaxConcurrentFileChunks(int maxConcurrentFileChunks) {
        this.maxConcurrentFileChunks = maxConcurrentFileChunks;
    }

    /** @return the current value of {@code indices.recovery.max_concurrent_operations} */
    public int getMaxConcurrentOperations() {
        return maxConcurrentOperations;
    }

    /** Update consumer for {@code indices.recovery.max_concurrent_operations}. */
    private void setMaxConcurrentOperations(int maxConcurrentOperations) {
        this.maxConcurrentOperations = maxConcurrentOperations;
    }

    /** @return whether all node bandwidth recovery settings were defined when this instance was constructed */
    public boolean nodeBandwidthSettingsExist() {
        return nodeBandwidthSettingsExist;
    }

    /** @return the current value of {@code indices.recovery.use_snapshots} */
    public boolean getUseSnapshotsDuringRecovery() {
        return useSnapshotsDuringRecovery;
    }

    /** Update consumer for {@code indices.recovery.use_snapshots}. */
    private void setUseSnapshotsDuringRecovery(boolean useSnapshotsDuringRecovery) {
        this.useSnapshotsDuringRecovery = useSnapshotsDuringRecovery;
    }

    /** @return the current value of {@code indices.recovery.max_concurrent_snapshot_file_downloads} */
    public int getMaxConcurrentSnapshotFileDownloads() {
        return maxConcurrentSnapshotFileDownloads;
    }

    /** Update consumer for {@code indices.recovery.max_concurrent_snapshot_file_downloads}. */
    public void setMaxConcurrentSnapshotFileDownloads(int maxConcurrentSnapshotFileDownloads) {
        this.maxConcurrentSnapshotFileDownloads = maxConcurrentSnapshotFileDownloads;
    }

    /** Update consumer for the node's concurrent incoming recoveries setting; the value is only used in warning messages here. */
    private void setMaxConcurrentIncomingRecoveries(int maxConcurrentIncomingRecoveries) {
        this.maxConcurrentIncomingRecoveries = maxConcurrentIncomingRecoveries;
    }

    /**
     * Update consumer for {@code indices.recovery.max_concurrent_snapshot_file_downloads_per_node}; also resizes the permit pool.
     */
    private void setMaxConcurrentSnapshotFileDownloadsPerNode(int maxConcurrentSnapshotFileDownloadsPerNode) {
        this.maxConcurrentSnapshotFileDownloadsPerNode = maxConcurrentSnapshotFileDownloadsPerNode;
        this.maxSnapshotFileDownloadsPerNodeSemaphore.setMaxPermits(maxConcurrentSnapshotFileDownloadsPerNode);
    }

    /**
     * Tries to reserve, without blocking, as many snapshot file download permits as a single recovery is allowed to use.
     *
     * @return a {@link Releasable} that gives the permits back (at most once) when closed, or {@code null} when snapshot-based
     *         recovery is disabled or the permits are not available; in the latter case a warning explaining how to adjust the
     *         settings is logged
     */
    @Nullable
    Releasable tryAcquireSnapshotDownloadPermits() {
        if (getUseSnapshotsDuringRecovery() == false) {
            return null;
        }

        final int permitsPerRecovery = getMaxConcurrentSnapshotFileDownloads();
        if (maxSnapshotFileDownloadsPerNodeSemaphore.tryAcquire(permitsPerRecovery)) {
            return Releasables.releaseOnce(() -> maxSnapshotFileDownloadsPerNodeSemaphore.release(permitsPerRecovery));
        }

        // the permits are exhausted: pick the advice matching the current relation between the two limits
        final String warning;
        if (this.maxConcurrentIncomingRecoveries <= this.maxConcurrentSnapshotFileDownloadsPerNode) {
            warning = String.format(
                Locale.ROOT,
                """
                    Unable to acquire permit to use snapshot files during recovery, so this recovery will recover index files from \
                    the source node. Ensure snapshot files can be used during recovery by setting [%s] to be no greater than [%d]. \
                    Current values of [%s] = [%d], [%s] = [%d]
                    """,
                INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS.getKey(),
                this.maxConcurrentSnapshotFileDownloadsPerNode / Math.max(1, this.maxConcurrentIncomingRecoveries),
                INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE.getKey(),
                this.maxConcurrentSnapshotFileDownloadsPerNode,
                CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
                this.maxConcurrentIncomingRecoveries
            );
        } else {
            warning = String.format(
                Locale.ROOT,
                """
                    Unable to acquire permit to use snapshot files during recovery, so this recovery will recover index files from \
                    the source node. Ensure snapshot files can be used during recovery by reducing [%s] from its current value of \
                    [%d] to be no greater than [%d], or disable snapshot-based recovery by setting [%s] to [false]
                    """,
                CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
                this.maxConcurrentIncomingRecoveries,
                this.maxConcurrentSnapshotFileDownloadsPerNode,
                INDICES_RECOVERY_USE_SNAPSHOTS_SETTING.getKey()
            );
        }
        logger.warn(warning);
        return null;
    }

    /**
     * Checks that the node bandwidth recovery settings are either all defined or all left undefined.
     *
     * @param settings the settings to inspect
     * @throws IllegalArgumentException if only some of the settings in {@link #NODE_BANDWIDTH_RECOVERY_SETTINGS} are configured
     */
    private static void validateNodeBandwidthRecoverySettings(Settings settings) {
        final List<String> nonDefaults = NODE_BANDWIDTH_RECOVERY_SETTINGS.stream()
            // compare by value: an undefined (-1) value is not guaranteed to be the MINUS_ONE instance itself
            .filter(setting -> ByteSizeValue.MINUS_ONE.equals(setting.get(settings)) == false)
            .map(Setting::getKey)
            .toList();
        if (nonDefaults.isEmpty() == false && nonDefaults.size() != NODE_BANDWIDTH_RECOVERY_SETTINGS.size()) {
            throw new IllegalArgumentException(
                "Settings "
                    + NODE_BANDWIDTH_RECOVERY_SETTINGS.stream().map(Setting::getKey).toList()
                    + " must all be defined or all be undefined; but only settings "
                    + nonDefaults
                    + " are configured."
            );
        }
    }

    /**
     * Whether the node bandwidth recovery settings are set.
     *
     * @param settings the settings to inspect
     * @return {@code true} if none of the settings in {@link #NODE_BANDWIDTH_RECOVERY_SETTINGS} has the undefined value (-1)
     */
    private static boolean hasNodeBandwidthRecoverySettings(Settings settings) {
        // compare by value: an undefined (-1) value is not guaranteed to be the MINUS_ONE instance itself
        return NODE_BANDWIDTH_RECOVERY_SETTINGS.stream().noneMatch(setting -> ByteSizeValue.MINUS_ONE.equals(setting.get(settings)));
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy