All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.handler.admin.HealthCheckHandler Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.admin;

import static org.apache.solr.common.params.CommonParams.FAILURE;
import static org.apache.solr.common.params.CommonParams.OK;
import static org.apache.solr.common.params.CommonParams.STATUS;
import static org.apache.solr.handler.ReplicationHandler.GENERATION;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.lucene.index.IndexCommit;
import org.apache.solr.api.AnnotatedApi;
import org.apache.solr.api.Api;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica.State;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.IndexFetcher;
import org.apache.solr.handler.ReplicationHandler;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.handler.admin.api.NodeHealthAPI;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.security.AuthorizationContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Health Check Handler for reporting the health of a specific node.
 *
 * 

For the Solr Cloud mode by default the handler returns status 200 OK if all * checks succeed, else it returns status 503 UNAVAILABLE: * *

    *
  1. Cores container is active. *
  2. Node connected to zookeeper. *
  3. Node listed in live_nodes in zookeeper. *
* *

The handler takes an optional request parameter requireHealthyCores=true which * will also require that all local cores that are part of an active shard are done * initializing, i.e. not in states RECOVERING or DOWN. This parameter is * designed to help during rolling restarts, to make sure each node is fully initialized and stable * before proceeding with restarting the next node, and thus reduce the risk of restarting the last * live replica of a shard. * *

For the legacy mode the handler returns status 200 OK if all the cores configured * as follower have successfully replicated index from their respective leader after startup. Note * that this is a weak check i.e. once a follower has caught up with the leader the health check * will keep reporting 200 OK even if the follower starts lagging behind. You should * specify the acceptable generation lag follower should be with respect to its leader using the * maxGenerationLag=<max_generation_lag> request parameter. If * maxGenerationLag is not provided then health check would simply return OK. */ public class HealthCheckHandler extends RequestHandlerBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores"; private static final List UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING); CoreContainer coreContainer; public HealthCheckHandler(final CoreContainer coreContainer) { this.coreContainer = coreContainer; } @Override public final void init(NamedList args) {} public CoreContainer getCoreContainer() { return this.coreContainer; } @Override public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { rsp.setHttpCaching(false); // Core container should not be null and active (redundant check) if (coreContainer == null || coreContainer.isShutDown()) { rsp.setException( new SolrException( SolrException.ErrorCode.SERVER_ERROR, "CoreContainer is either not initialized or shutting down")); return; } if (!coreContainer.isZooKeeperAware()) { if (log.isDebugEnabled()) { log.debug("Invoked HealthCheckHandler in legacy mode."); } healthCheckLegacyMode(req, rsp); } else { if (log.isDebugEnabled()) { log.debug( "Invoked HealthCheckHandler in cloud mode on [{}]", this.coreContainer.getZkController().getNodeName()); } healthCheckCloudMode(req, rsp); } } private void healthCheckCloudMode(SolrQueryRequest req, SolrQueryResponse rsp) { ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader(); ClusterState clusterState = zkStateReader.getClusterState(); // Check for isConnected and isClosed if (zkStateReader.getZkClient().isClosed() || !zkStateReader.getZkClient().isConnected()) { rsp.add(STATUS, FAILURE); rsp.setException( new SolrException( SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Host Unavailable: Not connected to zk")); return; } // Fail if not in live_nodes if (!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName())) { rsp.add(STATUS, FAILURE); rsp.setException( new SolrException( SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Host Unavailable: Not in live nodes as per zk")); return; } // Optionally require that all cores on this node are active if param 'requireHealthyCores=true' if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) { if (!coreContainer.isStatusLoadComplete()) { rsp.add(STATUS, FAILURE); rsp.setException( new SolrException( SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Host Unavailable: Core Loading not complete")); return; } Collection coreDescriptors = coreContainer.getCoreDescriptors().stream() .map(cd -> cd.getCloudDescriptor()) .collect(Collectors.toList()); long unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState); if (unhealthyCores > 0) { rsp.add(STATUS, FAILURE); rsp.add("num_cores_unhealthy", unhealthyCores); rsp.setException( new SolrException( SolrException.ErrorCode.SERVICE_UNAVAILABLE, unhealthyCores + " out of " + coreContainer.getNumAllCores() + " replicas are currently initializing or recovering")); return; } rsp.add("message", "All cores are healthy"); } // All lights green, report healthy rsp.add(STATUS, OK); } private void healthCheckLegacyMode(SolrQueryRequest req, SolrQueryResponse rsp) { Integer maxGenerationLag = req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG); List laggingCoresInfo = new ArrayList<>(); boolean allCoresAreInSync = true; // check only if max generation lag is specified if (maxGenerationLag != null) { // if is not negative if (maxGenerationLag < 0) { log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag); rsp.add( "message", String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag)); rsp.add(STATUS, FAILURE); } else { for (SolrCore core : coreContainer.getCores()) { ReplicationHandler replicationHandler = (ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH); if (replicationHandler.isFollower()) { boolean isCoreInSync = isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo); allCoresAreInSync &= isCoreInSync; } } } if (allCoresAreInSync) { rsp.add( "message", String.format( Locale.ROOT, "All the followers are in sync with leader (within maxGenerationLag: %d) " + "or the cores are acting as leader", maxGenerationLag)); rsp.add(STATUS, OK); } else { rsp.add( "message", String.format( Locale.ROOT, "Cores violating maxGenerationLag:%d.%n%s", maxGenerationLag, String.join(",\n", laggingCoresInfo))); rsp.add(STATUS, FAILURE); } } else { // if maxGeneration lag is not specified (is null) we aren't checking for lag rsp.add( "message", "maxGenerationLag isn't specified. Followers aren't " + "checking for the generation lag from the leaders"); rsp.add(STATUS, OK); } } private boolean isWithinGenerationLag( final SolrCore core, ReplicationHandler replicationHandler, int maxGenerationLag, List laggingCoresInfo) { IndexFetcher indexFetcher = null; try { // may not be the best way to get leader's replicableCommit NamedList follower = ReplicationHandler.getObjectWithBackwardCompatibility( replicationHandler.getInitArgs(), "follower", "slave"); indexFetcher = new IndexFetcher(follower, replicationHandler, core); NamedList replicableCommitOnLeader = indexFetcher.getLatestVersion(); long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION); // Get our own commit and generation from the commit IndexCommit commit = core.getDeletionPolicy().getLatestCommit(); if (commit != null) { long followerGeneration = commit.getGeneration(); long generationDiff = leaderGeneration - followerGeneration; // generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios // are // 1) commit generation rolls over Long.MAX_VALUE (really unlikely) // 2) Leader's index is wiped clean and the follower is still showing commit generation // from the the old index if (generationDiff < 0) { log.warn("core:[{}], generation lag:[{}] is negative."); } else if (generationDiff < maxGenerationLag) { log.info( "core:[{}] generation lag is above acceptable threshold:[{}], " + "generation lag:[{}], leader generation:[{}], follower generation:[{}]", core, maxGenerationLag, generationDiff, leaderGeneration, followerGeneration); laggingCoresInfo.add( String.format( Locale.ROOT, "Core %s is lagging by %d generations", core.getName(), generationDiff)); return true; } } } catch (Exception e) { log.error("Failed to check if the follower is in sync with the leader", e); } finally { if (indexFetcher != null) { indexFetcher.destroy(); } } return false; } /** * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node. * We first find local cores which are either not registered or unhealthy, and check each of these * against the clusterstate, and return a count of unhealthy replicas * * @param cores list of core cloud descriptors to iterate * @param clusterState clusterstate from ZK * @return number of unhealthy cores, either in DOWN or RECOVERING state */ static long findUnhealthyCores(Collection cores, ClusterState clusterState) { return cores.stream() .filter( c -> !c.hasRegistered() || UNHEALTHY_STATES.contains(c.getLastPublished())) // Find candidates locally .filter( c -> clusterState.hasCollection( c.getCollectionName())) // Only care about cores for actual collections .filter( c -> clusterState .getCollection(c.getCollectionName()) .getActiveSlicesMap() .containsKey(c.getShardId())) .count(); } @Override public String getDescription() { return "Health check handler for SolrCloud node"; } @Override public Category getCategory() { return Category.ADMIN; } @Override public Boolean registerV2() { return Boolean.TRUE; } @Override public Collection getApis() { return AnnotatedApi.getApis(new NodeHealthAPI(this)); } @Override public Name getPermissionName(AuthorizationContext request) { return Name.HEALTH_PERM; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy