org.jboss.pnc.environmentdriver.Driver Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of environment-driver Show documentation
Show all versions of environment-driver Show documentation
The environment driver provides REST endpoints to create and destroy OpenShift objects.
/**
* JBoss, Home of Professional Open Source.
* Copyright 2021 Red Hat, Inc., and individual contributors
* as indicated by the @author tags.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jboss.pnc.environmentdriver;
import static org.jboss.pnc.environmentdriver.Constants.METRICS_POD_STARTED_FAILED_KEY;
import static org.jboss.pnc.environmentdriver.Constants.METRICS_POD_STARTED_FAILED_REASON_KEY;
import static org.jboss.pnc.environmentdriver.Constants.METRICS_POD_STARTED_SUCCESS_KEY;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URI;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionStage;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.enterprise.context.RequestScoped;
import javax.inject.Inject;
import javax.ws.rs.core.MediaType;
import io.quarkus.oidc.client.OidcClient;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.text.StringSubstitutor;
import org.eclipse.microprofile.context.ManagedExecutor;
import org.eclipse.microprofile.rest.client.inject.RestClient;
import org.jboss.pnc.api.constants.HttpHeaders;
import org.jboss.pnc.api.constants.MDCHeaderKeys;
import org.jboss.pnc.api.constants.MDCKeys;
import org.jboss.pnc.api.dto.Request;
import org.jboss.pnc.api.environmentdriver.dto.EnvironmentCompleteResponse;
import org.jboss.pnc.api.environmentdriver.dto.EnvironmentCreateRequest;
import org.jboss.pnc.api.environmentdriver.dto.EnvironmentCreateResponse;
import org.jboss.pnc.api.environmentdriver.dto.EnvironmentCreateResult;
import org.jboss.pnc.common.Random;
import org.jboss.pnc.common.Strings;
import org.jboss.pnc.common.log.MDCUtils;
import org.jboss.pnc.common.otel.OtelUtils;
import org.jboss.pnc.environmentdriver.clients.IndyService;
import org.jboss.pnc.environmentdriver.clients.IndyTokenRequestDTO;
import org.jboss.pnc.environmentdriver.clients.IndyTokenResponseDTO;
import org.jboss.pnc.environmentdriver.enums.PodErrorStatuses;
import org.jboss.pnc.environmentdriver.exceptions.BadResourcesRequestException;
import org.jboss.pnc.environmentdriver.exceptions.DriverException;
import org.jboss.pnc.environmentdriver.exceptions.FailedResponseException;
import org.jboss.pnc.environmentdriver.exceptions.QuotaExceededException;
import org.jboss.pnc.environmentdriver.exceptions.StoppingException;
import org.jboss.pnc.environmentdriver.exceptions.TemporarilyUnableToStartException;
import org.jboss.pnc.environmentdriver.exceptions.UnableToRequestResourcesException;
import org.jboss.pnc.environmentdriver.exceptions.UnableToStartException;
import org.jboss.pnc.environmentdriver.runtime.ApplicationLifecycle;
import org.jboss.pnc.pncmetrics.GaugeMetric;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.exc.MismatchedInputException;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import io.fabric8.kubernetes.api.model.Container;
import io.fabric8.kubernetes.api.model.ContainerStatus;
import io.fabric8.kubernetes.api.model.HasMetadata;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.PodList;
import io.fabric8.kubernetes.api.model.Quantity;
import io.fabric8.kubernetes.api.model.Service;
import io.fabric8.kubernetes.api.model.ServiceList;
import io.fabric8.kubernetes.api.model.ServicePort;
import io.fabric8.kubernetes.client.KubernetesClientException;
import io.fabric8.openshift.api.model.Route;
import io.fabric8.openshift.client.OpenShiftClient;
import io.opentelemetry.instrumentation.annotations.SpanAttribute;
import io.opentelemetry.instrumentation.annotations.WithSpan;
import net.jodah.failsafe.Failsafe;
import net.jodah.failsafe.RetryPolicy;
/**
* @author Matej Lazar
*/
@RequestScoped
public class Driver {
/** Technical (service) log of this class. */
private static final Logger logger = LoggerFactory.getLogger(Driver.class);

/** Pod template property carrying the build config id when the archival sidecar is enabled (empty otherwise). */
public static final String ARCHIVAL_SERVICE_BUILD_CONFIG_ID = "BUILD_CONFIG_ID";

// User-facing error messages. ERROR_MESSAGE_INTRO is prepended to the more specific messages below.
public static final String ERROR_MESSAGE_INTRO = "\n\nAn error occurred while trying to create a build environment where to run the build. ";
public static final String ERROR_MESSAGE_REGISTRY = "The builder pod failed to download the builder image "
        + "(this could be due to issues with the builder images registry, or a misconfiguration of the builder image name).";
public static final String ERROR_MESSAGE_INVALID_IMAGE_NAME = "The builder pod failed to download the builder image "
        + "(the builder image has an invalid name).";
public static final String ERROR_MESSAGE_INITIALIZATION = "The builder pod failed to start "
        + "(this could be due to misconfigured or bogus init scripts, or other unknown reasons).";
public static final String ERROR_MESSAGE_TIMEOUT = " The maximum timeout has been reached. This could be due to an exhausted capacity of the underlying infrastructure "
        + "(there is no space available to create the new build environment).";
// Spelling fixed ("occured" -> "occurred"); the text is shown to users through the user log.
public static final String ERROR_MESSAGE_TEMPLATE_PARSE = " The builder pod failed to start because an error occurred while parsing either the pod or the service template.";
public static final String ERROR_MESSAGE_EXCEEDED_QUOTA = " The maximum quota available for the build environments has been exceeded.";

/** Log whose entries are intended for the user who requested the build (qualified with {@code @UserLogger}). */
@Inject
@UserLogger
Logger userLogger;

/** Executor on which all asynchronous OpenShift requests and retries of this driver are scheduled. */
@Inject
ManagedExecutor executor;

@Inject
ObjectMapper jsonMapper;

@Inject
Configuration configuration;

/** Registry of the in-flight monitoring futures, keyed by pod name, so that they can be cancelled. */
@Inject
ActiveMonitors activeMonitors;

@Inject
OpenShiftClient openShiftClient;

@Inject
java.net.http.HttpClient httpClient;

/** Tracks running operations and tells whether the application is shutting down. */
@Inject
ApplicationLifecycle lifecycle;

@Inject
OidcClient oidcClient;

@RestClient
IndyService indyService;

/** Mapper for YAML documents; presumably used to parse the resource templates - confirm in createModelNode. */
ObjectMapper yamlMapper = new ObjectMapper(new YAMLFactory());

// The type argument is required: callers do gaugeMetric.ifPresent(g -> g.incrementMetric(...)),
// which does not compile against a raw Optional.
private Optional<GaugeMetric> gaugeMetric = Optional.empty(); // TODO
/**
 * Calls the OpenShift API to create the pod and the service of a new build environment.
 * <p>
 * The method does not wait for the resources to become available: once both creation requests have been accepted a
 * monitor is started, and the completion callback of the request is invoked when the environment is ready (or has
 * failed). If a resource creation request fails, the returned stage is completed exceptionally. To keep the
 * required state minimal, the retries should be handled externally.
 *
 * @param environmentCreateRequest description of the environment to create
 * @return stage completed with the environment id and its cancel request once all required requests to OpenShift
 *         complete. It is completed exceptionally with {@link BadResourcesRequestException} when a template cannot
 *         be parsed, {@link QuotaExceededException} when the quota is exceeded,
 *         {@link UnableToRequestResourcesException} for any other request failure, or with the
 *         {@link DriverException} raised while reading the MDC
 * @throws StoppingException when the application is shutting down
 */
@WithSpan()
public CompletionStage<EnvironmentCreateResponse> create(
        @SpanAttribute(value = "environmentCreateRequest") EnvironmentCreateRequest environmentCreateRequest) {
    if (lifecycle.isShuttingDown()) {
        throw new StoppingException();
    }
    IndyTokenResponseDTO tokenResponseDTO = indyService.getAuthToken(
            new IndyTokenRequestDTO(environmentCreateRequest.getRepositoryBuildContentId()),
            "Bearer " + getFreshAccessToken());

    String environmentId = environmentCreateRequest.getEnvironmentLabel() + "-" + Random.randString(6);
    String podName = getPodName(environmentId);
    String serviceName = getServiceName(environmentId);

    // NOTE(review): the value type was lost in this copy of the file; String is assumed because every visible
    // value is a String or an Optional defaulted to "" - confirm against the Configuration getters.
    Map<String, String> podTemplateProperties = new HashMap<>();
    boolean proxyActive = configuration.getProxyServer().isPresent() && configuration.getProxyPort().isPresent();

    podTemplateProperties.put("image", environmentCreateRequest.getImageId());
    podTemplateProperties.put("firewallAllowedDestinations", configuration.getFirewallAllowedDestinations());
    podTemplateProperties
            .put("allowedHttpOutgoingDestinations", configuration.getAllowedHttpOutgoingDestinations().orElse(""));
    podTemplateProperties.put("isHttpActive", Boolean.toString(proxyActive));
    podTemplateProperties.put("proxyServer", configuration.getProxyServer().orElse(""));
    podTemplateProperties.put("proxyPort", configuration.getProxyPort().orElse(""));
    podTemplateProperties.put("nonProxyHosts", configuration.getNonProxyHosts().orElse(""));
    podTemplateProperties.put("AProxDependencyUrl", environmentCreateRequest.getRepositoryDependencyUrl());
    podTemplateProperties.put("AProxDeployUrl", environmentCreateRequest.getRepositoryDeployUrl());
    podTemplateProperties.put("containerPort", configuration.getBuildAgentContainerPort());
    podTemplateProperties.put("buildContentId", environmentCreateRequest.getRepositoryBuildContentId());
    // TODO: use another style of token for the accessToken with Indy
    podTemplateProperties.put("accessToken", tokenResponseDTO.getToken());
    podTemplateProperties.put("workingDirectory", configuration.getWorkingDirectory());
    podTemplateProperties.put(
            ARCHIVAL_SERVICE_BUILD_CONFIG_ID,
            environmentCreateRequest.isSidecarArchiveEnabled() ? environmentCreateRequest.getBuildConfigId() : "");

    try {
        podTemplateProperties.putAll(mdcToMap());
    } catch (DriverException e) {
        return CompletableFuture.failedFuture(e);
    }

    podTemplateProperties.put(
            "resourcesMemory",
            builderPodMemory(configuration.getBuilderPodMemory(), environmentCreateRequest.getPodMemoryOverride()));

    String buildAgentContextPath = getBuildAgentContextPath(environmentId);

    podTemplateProperties.put("environment-label", environmentCreateRequest.getEnvironmentLabel());
    podTemplateProperties.put("pod-name", podName);
    podTemplateProperties.put("service-name", serviceName);
    podTemplateProperties.put("buildAgentContextPath", buildAgentContextPath);

    // NOTE(review): RandomStringUtils is not documented as cryptographically secure in older commons-lang3
    // versions; consider a SecureRandom based generator for this password.
    String sshPassword;
    if (environmentCreateRequest.isAllowSshDebug()) {
        sshPassword = RandomStringUtils.randomAlphanumeric(10);
        podTemplateProperties.put("workerUserPassword", sshPassword);
    } else {
        sshPassword = "";
    }

    String podDefinition = configuration.getPodDefinition();

    CompletableFuture<Pod> podRequested = CompletableFuture.supplyAsync(() -> {
        Pod podCreationModel = createModelNode(podDefinition, podTemplateProperties, Pod.class);
        processPod(
                podCreationModel,
                environmentCreateRequest.isSidecarEnabled(),
                environmentCreateRequest.isSidecarArchiveEnabled());
        return openShiftClient.pods().create(podCreationModel);
    }, executor);

    CompletableFuture<Service> serviceRequested = CompletableFuture.supplyAsync(() -> {
        Service serviceCreationModel = createModelNode(
                configuration.getServiceDefinition(),
                podTemplateProperties,
                Service.class);
        return openShiftClient.services().create(serviceCreationModel);
    }, executor);

    // Handle the exceptions thrown by requests (e.g. KubernetesClientException for exceeded quota)
    CompletableFuture<Void> requestFailure = new CompletableFuture<>();
    podRequested.exceptionally(ex -> {
        requestFailure.completeExceptionally(ex);
        return null;
    });
    serviceRequested.exceptionally(ex -> {
        requestFailure.completeExceptionally(ex);
        return null;
    });

    // Do not wait for both podRequested and serviceRequested completion, one exception
    // of them is enough to stop waiting for the other
    CompletableFuture<Void> podAndServiceRequested = CompletableFuture.allOf(podRequested, serviceRequested);

    return CompletableFuture.anyOf(requestFailure, podAndServiceRequested).thenApplyAsync((nul) -> {
        startMonitor(
                serviceName,
                buildAgentContextPath,
                podName,
                environmentCreateRequest.getCompletionCallback(),
                sshPassword);
        return new EnvironmentCreateResponse(environmentId, getCancelRequest(environmentId));
    }, executor).exceptionally(throwable -> {
        Throwable rootCause = getRootCause(throwable);
        logger.error("Exception thrown with message: {}", throwable, rootCause);
        gaugeMetric.ifPresent(g -> g.incrementMetric(METRICS_POD_STARTED_FAILED_KEY));

        // instanceof is already false for null, no separate null check is needed
        if (rootCause instanceof MismatchedInputException) {
            userLogger.error(ERROR_MESSAGE_INTRO + ERROR_MESSAGE_TEMPLATE_PARSE);
            throw new BadResourcesRequestException(ERROR_MESSAGE_INTRO + ERROR_MESSAGE_TEMPLATE_PARSE, rootCause);
        }
        // The message may be null; guard it so that a NullPointerException does not replace the real failure.
        if (rootCause instanceof KubernetesClientException && rootCause.getMessage() != null
                && rootCause.getMessage().contains("exceeded quota")) {
            userLogger.error(ERROR_MESSAGE_INTRO + ERROR_MESSAGE_EXCEEDED_QUOTA);
            throw new QuotaExceededException(ERROR_MESSAGE_INTRO + ERROR_MESSAGE_EXCEEDED_QUOTA, rootCause);
        }

        String failureMessage = rootCause != null ? rootCause.getMessage() : throwable.getMessage();
        userLogger.error(failureMessage);
        throw new UnableToRequestResourcesException(failureMessage);
    });
}
/**
 * Strips the optional sidecar containers from the pod model before it is sent to OpenShift.
 *
 * @param pod pod model to adjust in place
 * @param sidecarEnabled when {@code false}, the first container named {@code sidecar} is removed
 * @param sidecarArchiveEnabled when {@code false}, the first init container named {@code sidecar-init} is removed
 */
private void processPod(Pod pod, boolean sidecarEnabled, boolean sidecarArchiveEnabled) {
    if (!sidecarEnabled) {
        removeFirstContainerNamed(pod.getSpec().getContainers(), "sidecar");
    }
    if (!sidecarArchiveEnabled) {
        removeFirstContainerNamed(pod.getSpec().getInitContainers(), "sidecar-init");
    }
}

/** Removes the first container with the given name from the list; does nothing when there is none. */
private static void removeFirstContainerNamed(List<Container> containers, String containerName) {
    // The element type is required here: with a raw List the lambda parameter is an Object without getName().
    containers.stream()
            .filter(container -> containerName.equals(container.getName()))
            .findFirst()
            .ifPresent(containers::remove);
}
/**
 * Builds the request a client can send to cancel the creation of the given environment.
 *
 * @param environmentId id of the environment the request refers to
 * @return PUT request to {@code <this service base url>/cancel/<environmentId>} carrying the headers from
 *         {@code getHeaders()}
 */
private Request getCancelRequest(String environmentId) {
    String serviceBaseUrl = Strings.stripEndingSlash(configuration.getThisServiceBaseUrl());
    URI cancelUri = URI.create(serviceBaseUrl + "/cancel/" + environmentId);
    return Request.builder().method(Request.Method.PUT).uri(cancelUri).headers(getHeaders()).build();
}
/**
 * Exposes the ssh port of the environment's pod: creates the ssh service and the route, then waits until the ssh
 * endpoint accepts connections.
 *
 * @param environmentId id of the environment to enable the debugging for
 * @return stage completed with the host and port to which ssh is forwarded; completed exceptionally when a
 *         resource cannot be created, the created service has no port with the configured ssh port name, or the
 *         ssh endpoint does not become reachable
 */
@WithSpan()
public CompletionStage<EnvironmentCompleteResponse> enableDebug(
        @SpanAttribute(value = "environmentId") String environmentId) {
    // NOTE(review): the value type was lost in this copy of the file; String is assumed - confirm against
    // createModelNode and the name helpers.
    Map<String, String> serviceTemplateProperties = new HashMap<>();
    serviceTemplateProperties.put("pod-name", getPodName(environmentId));
    serviceTemplateProperties.put("ssh-service-name", getSshServiceName(environmentId));

    Map<String, String> routeTemplateProperties = new HashMap<>();
    routeTemplateProperties.put("route-name", getRouteName(environmentId));
    routeTemplateProperties.put("route-path", getBuildAgentContextPath(environmentId));
    routeTemplateProperties.put("service-name", getServiceName(environmentId));
    routeTemplateProperties.put("build-agent-host", configuration.getBuildAgentHost());

    // Enable ssh forwarding and complete with the port to which ssh is forwarded
    CompletableFuture<Integer> sshPortAllocated = CompletableFuture.supplyAsync(() -> {
        Service serviceCreationModel = createModelNode(
                configuration.getSshServiceDefinition(),
                serviceTemplateProperties,
                Service.class);
        Service sshService = openShiftClient.services().create(serviceCreationModel);
        String sshPortName = configuration.getSshServicePortName();
        return sshService.getSpec()
                .getPorts()
                .stream()
                // a port may be unnamed; treat it as "no match" instead of failing with a NullPointerException
                .filter(port -> port.getName() != null && port.getName().equals(sshPortName))
                .findAny()
                .orElseThrow(() -> new RuntimeException("No ssh service in response! Service data: " + sshService))
                .getNodePort();
    }, executor);

    CompletableFuture<InetSocketAddress> sshAddressExposed = sshPortAllocated.thenApplyAsync(sshPort -> {
        Route routeCreationModel = createModelNode(
                configuration.getRouteDefinition(),
                routeTemplateProperties,
                Route.class);
        Route route = openShiftClient.routes().create(routeCreationModel);
        String sshHost = route.getSpec().getHost();
        return new InetSocketAddress(sshHost, sshPort);
    }, executor);

    // NOTE(review): as in the original, the two stages below run on the default async pool, not on `executor`.
    CompletableFuture<InetSocketAddress> sshReachable = sshAddressExposed.thenComposeAsync(this::pingSsh);
    return sshReachable.thenApplyAsync(
            socketAddr -> new EnvironmentCompleteResponse(socketAddr.getHostName(), socketAddr.getPort()));
}
/**
 * Repeatedly opens a TCP connection to the given address until it succeeds or the configured ssh-ping retry
 * duration elapses.
 *
 * @param inetSocketAddress address of the forwarded ssh endpoint
 * @return future completed with the same address once a connection could be established; completed exceptionally
 *         when the retries are exhausted
 */
private CompletableFuture<InetSocketAddress> pingSsh(InetSocketAddress inetSocketAddress) {
    // The type argument is required: with a raw RetryPolicy the listener lambdas receive plain Objects.
    RetryPolicy<InetSocketAddress> retryPolicy = new RetryPolicy<InetSocketAddress>()
            .withMaxDuration(Duration.ofSeconds(configuration.getSshPingRetryDuration()))
            .withMaxRetries(Integer.MAX_VALUE) // retry until maxDuration is reached
            .withBackoff(
                    configuration.getSshPingRetryDelayMsec(),
                    configuration.getSshPingRetryMaxDelayMsec(),
                    ChronoUnit.MILLIS) // don't wait too long, the response is waiting
            .onSuccess(ctx -> logger.info("SSH ping success."))
            .onRetry(
                    ctx -> logger.warn(
                            "Ssh ping retry attempt: #{}, last error: [{}]",
                            ctx.getAttemptCount(),
                            ctx.getLastFailure()))
            .onFailure(ctx -> logger.error("Unable to ping ssh service: {}.", ctx.getFailure().getMessage()))
            .onAbort(e -> logger.warn("SSH ping aborted: {}.", e.getFailure().getMessage()));

    return Failsafe.with(retryPolicy).with(executor).getAsync(() -> {
        Socket socket = new Socket();
        try {
            // the connect timeout is configured in seconds, Socket.connect expects milliseconds
            socket.connect(inetSocketAddress, configuration.getHttpClientConnectTimeout() * 1000);
            logger.info("Ssh-ping success.");
        } finally {
            // Deliberately not try-with-resources: a failure to close must not fail a successful ping.
            try {
                socket.close();
            } catch (IOException e) {
                logger.warn("Failed to clean-up after ssh-ping.");
            }
        }
        return inetSocketAddress;
    });
}
/**
 * Best-effort removal of the service and the pod of one environment. Fire and forget: the deletions are only
 * scheduled (with retries) on the executor, nothing is awaited and no result is reported to the caller.
 *
 * @param environmentId id of the environment to destroy
 */
@WithSpan()
public void destroy(@SpanAttribute(value = "environmentId") String environmentId) {
    String podName = getPodName(environmentId);
    String serviceName = getServiceName(environmentId);

    // stop the monitors that are still waiting for this pod
    activeMonitors.cancel(podName);

    Failsafe.with(getDestroyRetryPolicy(serviceName))
            .with(executor)
            .runAsync(() -> deleteServiceIfPresent(serviceName));
    Failsafe.with(getDestroyRetryPolicy(podName)).with(executor).runAsync(() -> deletePodIfPresent(podName));
}

/** Deletes the named service, or only logs a warning when no such service is found. */
private void deleteServiceIfPresent(String serviceName) {
    Service existing = openShiftClient.services().withName(serviceName).get();
    if (existing == null) {
        logger.warn("Service {} does not exists.", serviceName);
        return;
    }
    openShiftClient.services().delete(existing);
    logger.info("Service {} deleted.", serviceName);
}

/** Deletes the named pod, or only logs a warning when no such pod is found. */
private void deletePodIfPresent(String podName) {
    Pod existing = openShiftClient.pods().withName(podName).get();
    if (existing == null) {
        logger.warn("Pod {} does not exists.", podName);
        return;
    }
    openShiftClient.pods().delete(existing);
    logger.info("Pod {} deleted.", podName);
}
/**
 * Creates the retry policy shared by all destroy operations: retries with backoff until the configured destroy
 * retry duration is reached and logs every outcome.
 *
 * @param resourceName name of the resource being destroyed, used in the log messages only
 * @return a new retry policy
 */
private RetryPolicy<Object> getDestroyRetryPolicy(String resourceName) {
    // The type argument is required: with a raw RetryPolicy the listener lambdas receive plain Objects
    // and calls such as ctx.getAttemptCount() do not compile.
    return new RetryPolicy<Object>().withMaxDuration(Duration.ofSeconds(configuration.getDestroyRetryDuration()))
            .withMaxRetries(Integer.MAX_VALUE) // retry until maxDuration is reached
            .withBackoff(
                    configuration.getDestroyRetryDelayMsec(),
                    configuration.getDestroyRetryMaxDelayMsec(),
                    ChronoUnit.MILLIS)
            .onSuccess(ctx -> logger.info("Destroy {} success.", resourceName))
            .onRetry(
                    ctx -> logger.warn(
                            "Destroy {} retry attempt #{}, last error: [{}].",
                            resourceName,
                            ctx.getAttemptCount(),
                            ctx.getLastFailure().getMessage()))
            .onFailure(
                    ctx -> logger.error("Unable to destroy {}: {}.", resourceName, ctx.getFailure().getMessage()))
            .onAbort(e -> logger.warn("Destroy {} aborted: {}.", resourceName, e.getFailure().getMessage()));
}
/**
 * Best-effort removal of every service and pod labelled {@code environment=<environmentLabel>}. Fire and forget:
 * the deletions are only scheduled (with retries) on the executor and nothing is awaited.
 *
 * @param environmentLabel value of the {@code environment} label selecting the resources to delete
 */
@WithSpan()
public void destroyAll(@SpanAttribute(value = "environmentLabel") String environmentLabel) {
    // services first ...
    Failsafe.with(getDestroyRetryPolicy("services-of-" + environmentLabel)).with(executor).runAsync(() -> {
        ServiceList labelledServices = openShiftClient.services()
                .withLabel("environment", environmentLabel)
                .list();
        if (labelledServices == null) {
            logger.warn("No services found by label {}.", environmentLabel);
            return;
        }
        openShiftClient.services().delete(labelledServices.getItems());
        logger.info("Services {} deleted.", getItemNames(labelledServices.getItems()));
    });

    // ... then the pods
    Failsafe.with(getDestroyRetryPolicy("pods-of-" + environmentLabel)).with(executor).runAsync(() -> {
        PodList labelledPods = openShiftClient.pods().withLabel("environment", environmentLabel).list();
        if (labelledPods == null) {
            logger.warn("No pods found by label {}.", environmentLabel);
            return;
        }
        openShiftClient.pods().delete(labelledPods.getItems());
        logger.info("Pods {} deleted.", getItemNames(labelledPods.getItems()));
    });
}
private void startMonitor(
String serviceName,
String buildAgentContextPath,
String podName,
Request completionCallback,
String sshPassword) {
lifecycle.addActiveOperation();
CompletableFuture serviceRunning = isServiceRunning(serviceName);
activeMonitors.add(podName, serviceRunning);
CompletableFuture podRunning = isPodRunning(podName);
activeMonitors.add(podName, podRunning);
// Do not wait for both podRunning and serviceRunning completion, one exception
// of them is enough to stop
// waiting for the other
CompletableFuture failure = new CompletableFuture<>();
podRunning.exceptionally(ex -> {
failure.completeExceptionally(ex);
return null;
});
serviceRunning.exceptionally(ex -> {
failure.completeExceptionally(ex);
return null;
});
CompletableFuture.anyOf(failure, CompletableFuture.allOf(podRunning, serviceRunning))
.thenComposeAsync((nul) -> {
URI serviceBaseUri = serviceRunning.join(); // future already completed (allOff)
String serviceUriWithContext = serviceBaseUri + buildAgentContextPath;
CompletableFuture> pingFuture = pingBuildAgent(serviceUriWithContext);
activeMonitors.add(podName, pingFuture);
return pingFuture.thenApply((ignoreResponse) -> URI.create(serviceUriWithContext));
})
.handleAsync((serviceUriWithContext, throwable) -> {
logger.info("Completing monitor for pod: {}", podName);
activeMonitors.remove(podName);
CompletableFuture> callback;
if (throwable != null) {
logger.info("Received throwable: {}", throwable);
if (throwable instanceof CancellationException
|| throwable.getCause() instanceof CancellationException) {
callback = callback(completionCallback, EnvironmentCreateResult.cancelled());
} else if (throwable instanceof TemporarilyUnableToStartException
|| throwable.getCause() instanceof TemporarilyUnableToStartException) {
callback = callback(completionCallback, EnvironmentCreateResult.systemError(throwable));
gaugeMetric.ifPresent(g -> g.incrementMetric(METRICS_POD_STARTED_FAILED_KEY));
} else {
callback = callback(completionCallback, EnvironmentCreateResult.failed(throwable));
gaugeMetric.ifPresent(g -> g.incrementMetric(METRICS_POD_STARTED_FAILED_KEY));
}
} else {
EnvironmentCreateResult environmentCreateResult = EnvironmentCreateResult
.success(serviceUriWithContext, configuration.getWorkingDirectory(), sshPassword);
callback = callback(completionCallback, environmentCreateResult);
gaugeMetric.ifPresent(g -> g.incrementMetric(METRICS_POD_STARTED_SUCCESS_KEY));
}
callback.handle((r, t) -> {
if (t != null) {
logger.error("Unable to send completion callback.", t);
}
lifecycle.removeActiveOperation();
return null;
});
return null;
});
}
/**
 * Converts a Kubernetes quantity into the unit used in the resource reports: Gi for the binary (memory) suffixes
 * and whole units (cores) for the decimal sub-unit (cpu) suffixes.
 * <ul>
 * <li>blank unit: the value itself</li>
 * <li>{@code Ki} / {@code Mi} / {@code Gi} / {@code Ti}: the value expressed in Gi</li>
 * <li>{@code m} / {@code u} / {@code n}: milli / micro / nano units, i.e. the value divided by 10^3 / 10^6 /
 * 10^9 ({@code 500m} cpu is half a core)</li>
 * </ul>
 * NOTE(review): a blank unit on a memory quantity presumably means plain bytes, which are returned unchanged.
 *
 * @param quantity quantity to convert
 * @return converted value
 * @throws RuntimeException if the quantity is missing or it does not know how to convert the unit
 */
private static double convertQuantity(Quantity quantity) {
    if (quantity == null) {
        throw new RuntimeException("Don't know how to convert a missing quantity");
    }
    // switch on a null String would throw a NullPointerException; treat a missing format as "no unit"
    String unit = quantity.getFormat() == null ? "" : quantity.getFormat();
    double value = Double.parseDouble(quantity.getAmount());
    switch (unit) {
        case "":
        case "Gi":
            return value;
        case "Ti":
            return value * 1024;
        case "Mi":
            return value / 1024;
        case "Ki":
            return value / (1024 * 1024);
        case "m":
            // decimal milli-unit: 1000m == 1, not 1024m
            return value / 1_000;
        case "u":
            return value / 1_000_000;
        case "n":
            return value / 1_000_000_000;
        default:
            throw new RuntimeException("Don't know how to convert the unit " + quantity);
    }
}
/**
 * Builds a human readable summary of the pod's memory/cpu figures and of the resources still available according
 * to the resource quota. Each half is best effort: when it cannot be computed it is left out and a warning is
 * logged.
 *
 * @param podName name of the pod to report on
 * @return the summary, always starting with a new line
 */
private String getPodRequestedVsAvailableResourcesInfo(String podName) {
    return "\n" + describePodResources(podName) + describeAvailableResources(podName);
}

/** Pod half of the summary; an empty string when the figures cannot be obtained. */
private String describePodResources(String podName) {
    try {
        double podMemory = calculateResourceUsedByPod(podName, "memory");
        double podCpu = calculateResourceUsedByPod(podName, "cpu");
        return String.format("Pod %s requested resources: %.2fGB %.1fcpu; ", podName, podMemory, podCpu);
    } catch (KubernetesClientException kEx) {
        logger.warn("Cannot calculate Pod {} requested resources", podName, kEx);
    } catch (RuntimeException rEx) {
        logger.warn("Cannot convert Pod {} requested resources", podName, rEx);
    }
    return "";
}

/** Quota half of the summary; an empty string when the figures cannot be obtained. */
private String describeAvailableResources(String podName) {
    try {
        double freeMemory = calculateAvailableResource("memory");
        double freeCpu = calculateAvailableResource("cpu");
        return String.format("Available resources: %.2fGB %.1fcpu", freeMemory, freeCpu);
    } catch (KubernetesClientException kEx) {
        logger.warn("Cannot calculate available resources in the namespace of Pod {}", podName, kEx);
    } catch (RuntimeException rEx) {
        logger.warn("Cannot convert available resources in the namespace of Pod {}", podName, rEx);
    } catch (Exception ex) {
        logger.warn("Error while getting available resources in the namespace of Pod {}", podName, ex);
    }
    return "";
}
/**
 * Sums the usage of one resource type over all containers of the pod, as reported by the pod metrics.
 *
 * @param podName name of the pod
 * @param resourceType key of the resource in the usage map, e.g. {@code memory} or {@code cpu}
 * @return the sum of the converted per-container usage values
 */
private double calculateResourceUsedByPod(String podName, String resourceType) {
    var podMetricsApi = openShiftClient.top().pods();
    // the metrics lookup needs the namespace, which is read from the pod itself
    String namespace = openShiftClient.pods().withName(podName).get().getMetadata().getNamespace();
    var containersMetrics = podMetricsApi.metrics(namespace, podName).getContainers();
    return containersMetrics.stream()
            .mapToDouble(metrics -> convertQuantity(metrics.getUsage().get(resourceType)))
            .sum();
}
/**
 * Computes how much of a resource is still available: the hard limit minus the used amount of
 * {@code limits.<resourceType>} in the resource quota.
 *
 * @param resourceType resource to look up, e.g. {@code memory} or {@code cpu}
 * @return the remaining amount, or {@code 0.0} when there is no resource quota
 */
private double calculateAvailableResource(String resourceType) {
    String identifier = "limits." + resourceType;
    var quotas = openShiftClient.resourceQuotas().list().getItems();
    if (quotas.isEmpty()) {
        return 0.0;
    }
    // Only consider the first quota, we generally only use one!
    var quotaStatus = quotas.get(0).getStatus();
    double hardLimit = convertQuantity(quotaStatus.getHard().get(identifier));
    double used = convertQuantity(quotaStatus.getUsed().get(identifier));
    return hardLimit - used;
}
/**
 * Polls OpenShift until the pod reaches the {@code Running} phase, retrying with backoff for at most the
 * configured pod-running wait time.
 * <p>
 * If the pod phase, or the termination/waiting reason of any of its (init) containers, is one of the
 * {@link PodErrorStatuses}, the polling is aborted with {@link UnableToStartException} (non retryable status) or
 * {@link TemporarilyUnableToStartException} (retryable status).
 *
 * @param podName name of the pod to check
 * @return future completed (without a value) once the pod is running; completed exceptionally when the polling
 *         is aborted or the retries are exhausted
 */
private CompletableFuture<Void> isPodRunning(String podName) {
    // The type argument is required: with a raw RetryPolicy the listener lambdas receive plain Objects.
    RetryPolicy<Object> retryPolicy = new RetryPolicy<Object>()
            .withMaxDuration(Duration.ofSeconds(configuration.getPodRunningWaitFor()))
            .withMaxRetries(Integer.MAX_VALUE) // retry until maxDuration is reached
            .withBackoff(
                    configuration.getPodRunningRetryDelayMsec(),
                    configuration.getPodRunningRetryMaxDelayMsec(),
                    ChronoUnit.MILLIS)
            .abortOn(UnableToStartException.class, TemporarilyUnableToStartException.class)
            .onSuccess(ctx -> userLogger.info("Pod {} is running.", podName))
            .onRetry(
                    // Received the DriverException
                    ctx -> userLogger.warn(
                            "Running pod {} check #{}; last checked status: [{}].",
                            podName,
                            ctx.getAttemptCount(),
                            ctx.getLastFailure() != null ? ctx.getLastFailure().getMessage() : ""))
            .onRetriesExceeded(ctx -> {
                String errMsg = String.format(
                        "Unable to start pod %s. %s",
                        podName,
                        (ctx.getFailure() != null ? ctx.getFailure().getMessage() : ""));
                errMsg += ERROR_MESSAGE_INTRO + ERROR_MESSAGE_TIMEOUT
                        + getPodRequestedVsAvailableResourcesInfo(podName);
                userLogger.warn(errMsg);
                throw new UnableToStartException(errMsg);
            })
            .onFailure(
                    ctx -> userLogger.error(
                            "Unable to start pod {}. {}",
                            podName,
                            (ctx.getFailure() != null ? ctx.getFailure().getMessage() : "")))
            .onAbort(
                    // Received the UnableToStartException
                    e -> logger.info(
                            "IsPodRunning for pod {} was aborted. {}",
                            podName,
                            (e.getFailure() != null ? e.getFailure().getMessage() : "")));

    return Failsafe.with(retryPolicy).with(executor).runAsync(() -> {
        Pod pod = openShiftClient.pods().withName(podName).get();
        if (pod == null || pod.getStatus() == null) {
            // Still retried like any other DriverException, but with a meaningful message instead of a
            // NullPointerException whose message is null.
            throw new DriverException("Pod " + podName + " has no status available yet.");
        }
        String podStatus = pod.getStatus().getPhase();

        // Get all the termination or waiting reasons for the containers inside the Pod
        Set<String> containerStatuses = new HashSet<>();
        collectStateReasons(pod.getStatus().getInitContainerStatuses(), containerStatuses);
        collectStateReasons(pod.getStatus().getContainerStatuses(), containerStatuses);
        logger.info("Pod {} status: {} containersStatusesReasons: {}", podName, podStatus, containerStatuses);

        // If the pod final status OR any status of all the containers in the pod are among the failed statuses,
        // abort isPodRunning
        Optional<PodErrorStatuses> failedPodStatus = PodErrorStatuses.getIfPresent(podStatus);
        Optional<PodErrorStatuses> failedContainerStatus = PodErrorStatuses.getIfPresent(containerStatuses);
        if (failedPodStatus.isPresent() || failedContainerStatus.isPresent()) {
            // The status was among the failed statuses; the pod level status takes precedence
            PodErrorStatuses detectedPodErrorStatus = failedPodStatus.isPresent() ? failedPodStatus.get()
                    : failedContainerStatus.get();
            gaugeMetric.ifPresent(
                    g -> g.incrementMetric(
                            METRICS_POD_STARTED_FAILED_REASON_KEY + "." + detectedPodErrorStatus.getStatus()));
            String failure = "Pod failed with status: " + detectedPodErrorStatus.getStatus()
                    + detectedPodErrorStatus.getCustomErrMsg();
            if (detectedPodErrorStatus.isRetryable()) {
                throw new TemporarilyUnableToStartException(failure);
            }
            throw new UnableToStartException(failure);
        }

        if (!"Running".equals(podStatus)) {
            throw new DriverException("Pod " + podName + " is not running.");
        }
    });
}

/** Adds the terminated and waiting reasons of the given container statuses (which may be null) to the set. */
private static void collectStateReasons(List<ContainerStatus> statuses, Set<String> reasons) {
    if (statuses == null) {
        return;
    }
    for (ContainerStatus containerStatus : statuses) {
        if (containerStatus.getState() == null) {
            continue;
        }
        if (containerStatus.getState().getTerminated() != null) {
            reasons.add(containerStatus.getState().getTerminated().getReason());
        }
        if (containerStatus.getState().getWaiting() != null) {
            reasons.add(containerStatus.getState().getWaiting().getReason());
        }
    }
}
private CompletableFuture isServiceRunning(String serviceName) {
RetryPolicy
© 2015 - 2025 Weber Informatics LLC | Privacy Policy