All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.registration.RetryingRegistration Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.registration;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.runtime.concurrent.AcceptFunction;
import org.apache.flink.runtime.concurrent.ApplyFunction;
import org.apache.flink.runtime.concurrent.CompletableFuture;
import org.apache.flink.runtime.concurrent.Future;
import org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture;
import org.apache.flink.runtime.rpc.RpcGateway;
import org.apache.flink.runtime.rpc.RpcService;

import org.slf4j.Logger;

import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;


/**
 * This utility class implements the basis of registering one component at another component,
 * for example registering the TaskExecutor at the ResourceManager.
 * This {@code RetryingRegistration} implements both the initial address resolution
 * and the retries-with-backoff strategy.
 * 
 * 

The registration gives access to a future that is completed upon successful registration. * The registration can be canceled, for example when the target where it tries to register * at looses leader status. * * @param The type of the gateway to connect to. * @param The type of the successful registration responses. */ public abstract class RetryingRegistration { // ------------------------------------------------------------------------ // default configuration values // ------------------------------------------------------------------------ /** default value for the initial registration timeout (milliseconds) */ private static final long INITIAL_REGISTRATION_TIMEOUT_MILLIS = 100; /** default value for the maximum registration timeout, after exponential back-off (milliseconds) */ private static final long MAX_REGISTRATION_TIMEOUT_MILLIS = 30000; /** The pause (milliseconds) made after an registration attempt caused an exception (other than timeout) */ private static final long ERROR_REGISTRATION_DELAY_MILLIS = 10000; /** The pause (milliseconds) made after the registration attempt was refused */ private static final long REFUSED_REGISTRATION_DELAY_MILLIS = 30000; // ------------------------------------------------------------------------ // Fields // ------------------------------------------------------------------------ private final Logger log; private final RpcService rpcService; private final String targetName; private final Class targetType; private final String targetAddress; private final UUID leaderId; private final CompletableFuture> completionFuture; private final long initialRegistrationTimeout; private final long maxRegistrationTimeout; private final long delayOnError; private final long delayOnRefusedRegistration; private volatile boolean canceled; // ------------------------------------------------------------------------ public RetryingRegistration( Logger log, RpcService rpcService, String targetName, Class targetType, String targetAddress, UUID leaderId) { this(log, rpcService, targetName, targetType, targetAddress, leaderId, INITIAL_REGISTRATION_TIMEOUT_MILLIS, MAX_REGISTRATION_TIMEOUT_MILLIS, ERROR_REGISTRATION_DELAY_MILLIS, REFUSED_REGISTRATION_DELAY_MILLIS); } public RetryingRegistration( Logger log, RpcService rpcService, String targetName, Class targetType, String targetAddress, UUID leaderId, long initialRegistrationTimeout, long maxRegistrationTimeout, long delayOnError, long delayOnRefusedRegistration) { checkArgument(initialRegistrationTimeout > 0, "initial registration timeout must be greater than zero"); checkArgument(maxRegistrationTimeout > 0, "maximum registration timeout must be greater than zero"); checkArgument(delayOnError >= 0, "delay on error must be non-negative"); checkArgument(delayOnRefusedRegistration >= 0, "delay on refused registration must be non-negative"); this.log = checkNotNull(log); this.rpcService = checkNotNull(rpcService); this.targetName = checkNotNull(targetName); this.targetType = checkNotNull(targetType); this.targetAddress = checkNotNull(targetAddress); this.leaderId = checkNotNull(leaderId); this.initialRegistrationTimeout = initialRegistrationTimeout; this.maxRegistrationTimeout = maxRegistrationTimeout; this.delayOnError = delayOnError; this.delayOnRefusedRegistration = delayOnRefusedRegistration; this.completionFuture = new FlinkCompletableFuture<>(); } // ------------------------------------------------------------------------ // completion and cancellation // ------------------------------------------------------------------------ public Future> getFuture() { return completionFuture; } /** * Cancels the registration procedure. */ public void cancel() { canceled = true; } /** * Checks if the registration was canceled. * @return True if the registration was canceled, false otherwise. */ public boolean isCanceled() { return canceled; } // ------------------------------------------------------------------------ // registration // ------------------------------------------------------------------------ protected abstract Future invokeRegistration( Gateway gateway, UUID leaderId, long timeoutMillis) throws Exception; /** * This method resolves the target address to a callable gateway and starts the * registration after that. */ @SuppressWarnings("unchecked") public void startRegistration() { try { // trigger resolution of the resource manager address to a callable gateway Future resourceManagerFuture = rpcService.connect(targetAddress, targetType); // upon success, start the registration attempts Future resourceManagerAcceptFuture = resourceManagerFuture.thenAcceptAsync(new AcceptFunction() { @Override public void accept(Gateway result) { log.info("Resolved {} address, beginning registration", targetName); register(result, 1, initialRegistrationTimeout); } }, rpcService.getExecutor()); // upon failure, retry, unless this is cancelled resourceManagerAcceptFuture.exceptionallyAsync(new ApplyFunction() { @Override public Void apply(Throwable failure) { if (!isCanceled()) { log.warn("Could not resolve {} address {}, retrying...", targetName, targetAddress, failure); startRegistration(); } return null; } }, rpcService.getExecutor()); } catch (Throwable t) { cancel(); completionFuture.completeExceptionally(t); } } /** * This method performs a registration attempt and triggers either a success notification or a retry, * depending on the result. */ @SuppressWarnings("unchecked") private void register(final Gateway gateway, final int attempt, final long timeoutMillis) { // eager check for canceling to avoid some unnecessary work if (canceled) { return; } try { log.info("Registration at {} attempt {} (timeout={}ms)", targetName, attempt, timeoutMillis); Future registrationFuture = invokeRegistration(gateway, leaderId, timeoutMillis); // if the registration was successful, let the TaskExecutor know Future registrationAcceptFuture = registrationFuture.thenAcceptAsync(new AcceptFunction() { @Override public void accept(RegistrationResponse result) { if (!isCanceled()) { if (result instanceof RegistrationResponse.Success) { // registration successful! Success success = (Success) result; completionFuture.complete(Tuple2.of(gateway, success)); } else { // registration refused or unknown if (result instanceof RegistrationResponse.Decline) { RegistrationResponse.Decline decline = (RegistrationResponse.Decline) result; log.info("Registration at {} was declined: {}", targetName, decline.getReason()); } else { log.error("Received unknown response to registration attempt: {}", result); } log.info("Pausing and re-attempting registration in {} ms", delayOnRefusedRegistration); registerLater(gateway, 1, initialRegistrationTimeout, delayOnRefusedRegistration); } } } }, rpcService.getExecutor()); // upon failure, retry registrationAcceptFuture.exceptionallyAsync(new ApplyFunction() { @Override public Void apply(Throwable failure) { if (!isCanceled()) { if (failure instanceof TimeoutException) { // we simply have not received a response in time. maybe the timeout was // very low (initial fast registration attempts), maybe the target endpoint is // currently down. if (log.isDebugEnabled()) { log.debug("Registration at {} ({}) attempt {} timed out after {} ms", targetName, targetAddress, attempt, timeoutMillis); } long newTimeoutMillis = Math.min(2 * timeoutMillis, maxRegistrationTimeout); register(gateway, attempt + 1, newTimeoutMillis); } else { // a serious failure occurred. we still should not give up, but keep trying log.error("Registration at {} failed due to an error", targetName, failure); log.info("Pausing and re-attempting registration in {} ms", delayOnError); registerLater(gateway, 1, initialRegistrationTimeout, delayOnError); } } return null; } }, rpcService.getExecutor()); } catch (Throwable t) { cancel(); completionFuture.completeExceptionally(t); } } private void registerLater(final Gateway gateway, final int attempt, final long timeoutMillis, long delay) { rpcService.scheduleRunnable(new Runnable() { @Override public void run() { register(gateway, attempt, timeoutMillis); } }, delay, TimeUnit.MILLISECONDS); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy