All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.graylog2.system.processing.control.ClusterProcessingControl Maven / Gradle / Ivy

There is a newer version: 6.1.4
Show newest version
/*
 * Copyright (C) 2020 Graylog, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the Server Side Public License, version 1,
 * as published by MongoDB, Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Server Side Public License for more details.
 *
 * You should have received a copy of the Server Side Public License
 * along with this program. If not, see
 * <http://www.mongodb.com/licensing/server-side-public-license>.
 */
package org.graylog2.system.processing.control;

import com.github.joschi.jadconfig.util.Duration;
import com.github.rholder.retry.Attempt;
import com.github.rholder.retry.RetryException;
import com.github.rholder.retry.RetryListener;
import com.github.rholder.retry.Retryer;
import com.github.rholder.retry.RetryerBuilder;
import com.github.rholder.retry.StopStrategies;
import com.github.rholder.retry.WaitStrategies;
import org.graylog2.cluster.Node;
import org.graylog2.cluster.nodes.NodeService;
import org.graylog2.cluster.nodes.ServerNodeDto;
import org.graylog2.rest.RemoteInterfaceProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import retrofit2.Call;
import retrofit2.Response;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.graylog2.Configuration.INSTALL_OUTPUT_BUFFER_DRAINING_INTERVAL;
import static org.graylog2.Configuration.INSTALL_OUTPUT_BUFFER_DRAINING_MAX_RETRIES;
import static org.graylog2.shared.utilities.StringUtils.f;

public class ClusterProcessingControl  {
    private final Logger LOG = LoggerFactory.getLogger(ClusterProcessingControl.class);

    private static final String OUTPUT_RATE_METRIC_NAME = "org.graylog2.throughput.output.1-sec-rate";

    protected final String authorizationToken;
    protected final RemoteInterfaceProvider remoteInterfaceProvider;
    protected final NodeService nodeService;
    protected final Duration connectionTimeout;
    private final Duration bufferDrainInterval;
    private final int maxBufferDrainRetries;

    public ClusterProcessingControl(String authorizationToken, RemoteInterfaceProvider remoteInterfaceProvider, NodeService nodeService, Duration connectionTimeout, Duration bufferDrainInterval, int maxBufferDrainRetries) {
        this.authorizationToken = authorizationToken;
        this.remoteInterfaceProvider = remoteInterfaceProvider;
        this.nodeService = nodeService;
        this.connectionTimeout = connectionTimeout;
        this.bufferDrainInterval = bufferDrainInterval;
        this.maxBufferDrainRetries = maxBufferDrainRetries;
    }

    public void pauseProcessing() {
        runOnAllActiveNodes("pause processing", RemoteProcessingControlResource::pauseProcessing, true);
    }

    protected  Map runOnAllActiveNodes(
            String operationName,
            Function> callRemoteResource,
            boolean stopOnFirstException
    ) {
        final Map result = new HashMap<>();
        final List exceptions = new ArrayList<>();
        printNodeDebugInfo();
        nodeService.allActive().entrySet().forEach(entry -> {
            final Node nodeValue = entry.getValue();
            try {
                LOG.info("Attempting to call '{}' on node [{}].", operationName, nodeValue.getNodeId());
                final Response response = getrResponse(callRemoteResource, entry);
                if (!response.isSuccessful()) {
                    final String message = f("Unable to call '%s' on node [%s] code [%s] body [%s]",
                            operationName, nodeValue.getNodeId(),
                            response.code(), response.body());
                    LOG.error("Unable to call '{}' on node [{}] code [{}] body [{}].",
                            operationName, nodeValue.getNodeId(),
                            response.code(), response.body());
                    throw new ClusterProcessingControlException(message);
                }
                result.put(entry.getKey(), response.body());
                LOG.info("Successfully called '{}' on node [{}].", operationName, nodeValue.getNodeId());
            } catch (Exception e) {
                if (e instanceof ClusterProcessingControlException) {
                    exceptions.add((ClusterProcessingControlException) e);
                } else {
                    final String message = f("Unable to call '%s' on node [%s]", operationName, nodeValue.getNodeId());
                    LOG.error(message, e);
                    exceptions.add(new ClusterProcessingControlException(message, e));
                }

                if (stopOnFirstException) {
                    throw exceptions.get(0);
                }
            }
        });

        if (!exceptions.isEmpty()) {
            throw exceptions.get(0);
        }

        return result;
    }

    protected  Response getrResponse(Function> callRemoteResource, Map.Entry entry) throws IOException {
        var remoteProcessingControlResource = remoteInterfaceProvider.get(entry.getValue(),
                this.authorizationToken, RemoteProcessingControlResource.class,
                java.time.Duration.ofSeconds(connectionTimeout.toSeconds()));
        return callRemoteResource.apply((F) remoteProcessingControlResource).execute();
    }

    public void waitForEmptyBuffers() throws OutputBufferDrainFailureException {
        printNodeDebugInfo();
        final Retryer retryer = RetryerBuilder.newBuilder()
                .retryIfResult(value -> !value.success)
                .withWaitStrategy(WaitStrategies.fixedWait(bufferDrainInterval.toSeconds(), TimeUnit.SECONDS))
                .withStopStrategy(StopStrategies.stopAfterAttempt(maxBufferDrainRetries))
                .withRetryListener(new RetryListener() {
                    @Override
                    public  void onRetry(Attempt attempt) {
                        if (attempt.getAttemptNumber() > 1) {
                            LOG.info("Checking again for empty output buffers (attempt #{}).", attempt.getAttemptNumber());
                        }
                    }
                })
                .build();
        try {
            retryer.call(() -> {
                final Map nodeOutputRateMap = runOnAllActiveNodes("fetching output rate metric value",
                        res -> res.getMetric(OUTPUT_RATE_METRIC_NAME), true)
                        .entrySet()
                        .stream()
                        .collect(Collectors.toMap(Map.Entry::getKey, entry -> (Double) entry.getValue().get("value")));
                final boolean allZero = new HashSet<>(nodeOutputRateMap.values()).stream()
                        .allMatch(this::isOutputRateCloseToZero);
                final Set nonZeroNodes = nodeOutputRateMap
                        .entrySet()
                        .stream()
                        .filter(e -> !isOutputRateCloseToZero(e.getValue()))
                        .map(Map.Entry::getKey)
                        .collect(Collectors.toSet());
                if (allZero) {
                    LOG.info("Output buffer is now empty on all nodes.");
                } else {
                    LOG.info("Output rate has not yet reached zero on nodes [{}].", nonZeroNodes);
                }
                return new NodeOperationResult(allZero, nonZeroNodes);
            });
        } catch (RetryException e) {
            final String message = f("The [%s] rate failed to reach zero on all nodes in [%s] with [%s] retries. Giving up. " +
                            "This is configurable with the [%s] and [%s] configuration properties", OUTPUT_RATE_METRIC_NAME,
                    bufferDrainInterval.toSeconds(), maxBufferDrainRetries, INSTALL_OUTPUT_BUFFER_DRAINING_INTERVAL,
                    INSTALL_OUTPUT_BUFFER_DRAINING_MAX_RETRIES);
            LOG.error(message);
            throw new OutputBufferDrainFailureException(bufferDrainInterval.toSeconds(), maxBufferDrainRetries,
                    tryGetExceptionNodes(e));
        } catch (Exception e) {
            throw new ClusterProcessingControlException("Failed to request node output rate on all nodes.", e);
        }
    }

    /**
     * Try to retrieve the nodes that have a non-zero output rate from the RetryException.
     * This should succeed with the current implementation.
     */
    protected static Set tryGetExceptionNodes(RetryException e) {
        try {
            return ((NodeOperationResult) e.getLastFailedAttempt().get()).nonZeroOutputRateNodeIds();
        } catch (ExecutionException ex) {
            return Collections.emptySet();
        }
    }

    public record NodeOperationResult(boolean success, Set nonZeroOutputRateNodeIds) {
    }

    /**
     * The output rate is the number of messages per second that are being written to OpenSearch (usually a
     * whole number followed by some meaningless decimals - e.g. 100.01 messages/second).
     * A value < 1 is effectively zero. The rate might become very small, but not zero in some cases,
     * so this method accounts for that condition.
     */
    protected boolean isOutputRateCloseToZero(double outputRate) {
        return outputRate < 0.0001;
    }

    public void resumeGraylogMessageProcessing() {
        LOG.info("Attempting to resume processing on all nodes...");
        runOnAllActiveNodes("resume processing", RemoteProcessingControlResource::resumeProcessing, false);
        LOG.info("Done resuming processing on all nodes.");
    }

    protected void printNodeDebugInfo() {
        if (LOG.isDebugEnabled()) {
            LOG.debug("The Graylog cluster contains the following nodes:");
            nodeService.allActive().entrySet().forEach((entry) -> {
                final Node node = entry.getValue();
                LOG.debug("Node ID [{}] Transport Address [{}] Last Seen [{}]", node.getNodeId(), node.getTransportAddress(), node.getLastSeen());
            });
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy