All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.handler.admin.RebalanceLeaders Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.admin;

import static org.apache.solr.cloud.Overseer.QUEUE_OPERATION;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.LEADER_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_AT_ONCE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_WAIT_SECONDS_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.REBALANCELEADERS;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;

import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.solr.cloud.LeaderElector;
import org.apache.solr.cloud.OverseerTaskProcessor;
import org.apache.solr.cloud.overseer.SliceMutator;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The end point for the collections API REBALANCELEADERS call that actually does the work.
 *
 * 

Overview: * *

The leader election process is that each replica of a shard watches one, and only one other * replica via ephemeral nodes in ZooKeeper. When the node being watched goes down, the node * watching it is sent a notification and, if the node being watched is the leader, the node getting * the notification assumes leadership. * *

ZooKeeper's ephemeral nodes get a monotonically increasing "sequence number" that defines it's * position in the queue * *

So to force a particular node to become a leader it must have a watch on the leader. This can * lead to two nodes having the same sequence number. Say the process is this replica1 is the leader * (seq 1) replica3 is on a Solr node that happens to be started next, it watches the leader (seq2) * replica2 is on the next Solr node started. It will _also_ watch the leader, it's sequence number * is 2 exactly like replica3s * *

This is true on startup, but can also be a consequence of, say, a replica going into recovery. * It's no longer eligible to become leader, so will be put at the end of the queue by default. So * there's code to put it in the queue with the same sequence number as the current second replica. * *

To compilcate matters further, when the nodes are sorted (see * OverseerTaskProcessor.getSortedElectionNodes) the primary sort is on the sequence number, * secondary sort on the session ID. So the preferredLeader may or may not be second in that list. * *

what all this means is that when the REBALANCELEADER command is issued, this class examines * the election queue and performs just three things for each shard in the collection: * *

1> insures that the preferredLeader is watching the leader (rejoins the election queue at the * head) * *

2> if there are two ephemeral nodes with the same sequence number watching the leader, and if * one of them is the preferredLeader it will send the _other_ node to the end of the queue (rejoins * it) * *

3> rejoins the zeroth entry in the list at the end of the queue, which triggers the watch on * the preferredLeader replica which then takes over leadership * *

All this of course assuming the preferedLeader is alive and well and is assigned for any given * shard. */ class RebalanceLeaders { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); final SolrQueryRequest req; final SolrQueryResponse rsp; final CollectionsHandler collectionsHandler; final CoreContainer coreContainer; private final Set asyncRequests = new HashSet<>(); static final String INACTIVE_PREFERREDS = "inactivePreferreds"; static final String ALREADY_LEADERS = "alreadyLeaders"; static final String SUMMARY = "Summary"; final SimpleOrderedMap>> results = new SimpleOrderedMap<>(); final Map pendingOps = new HashMap<>(); private String collectionName; RebalanceLeaders( SolrQueryRequest req, SolrQueryResponse rsp, CollectionsHandler collectionsHandler) { this.req = req; this.rsp = rsp; this.collectionsHandler = collectionsHandler; coreContainer = collectionsHandler.getCoreContainer(); } void execute() throws KeeperException, InterruptedException { DocCollection dc = checkParams(); int max = req.getParams().getInt(MAX_AT_ONCE_PROP, Integer.MAX_VALUE); if (max <= 0) max = Integer.MAX_VALUE; int maxWaitSecs = req.getParams().getInt(MAX_WAIT_SECONDS_PROP, 60); // If there are a maximum number of simultaneous requests specified, we have to pause when we // have that many outstanding requests and wait for at least one to finish before going on the // the next rebalance. boolean keepGoing = true; for (Slice slice : dc.getSlices()) { ensurePreferredIsLeader(slice); if (asyncRequests.size() == max) { log.info("Queued {} leader reassignments, waiting for some to complete.", max); keepGoing = waitAsyncRequests(maxWaitSecs, false); if (keepGoing == false) { break; // If we've waited longer than specified, don't continue to wait! } } } if (keepGoing == true) { keepGoing = waitAsyncRequests(maxWaitSecs, true); } if (keepGoing == true) { log.info("All leader reassignments completed."); } else { log.warn( "Exceeded specified timeout of '{}' all leaders may not have been reassigned'", maxWaitSecs); } checkLeaderStatus(); SimpleOrderedMap summary = new SimpleOrderedMap<>(); if (pendingOps.size() == 0) { summary.add( "Success", "All active replicas with the preferredLeader property set are leaders"); } else { summary.add("Failure", "Not all active replicas with preferredLeader property are leaders"); } rsp.getValues().add(SUMMARY, summary); // we want this first. rsp.getValues().addAll(results); } // Insure that ll required parameters are there and the doc colection exists. private DocCollection checkParams() throws KeeperException, InterruptedException { req.getParams().required().check(COLLECTION_PROP); collectionName = req.getParams().get(COLLECTION_PROP); if (StrUtils.isBlank(collectionName)) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, String.format( Locale.ROOT, "The " + COLLECTION_PROP + " is required for the Rebalance Leaders command.")); } coreContainer.getZkController().getZkStateReader().forceUpdateCollection(collectionName); ClusterState clusterState = coreContainer.getZkController().getClusterState(); DocCollection dc = clusterState.getCollection(collectionName); if (dc == null) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Collection '" + collectionName + "' does not exist, no action taken."); } return dc; } // Once we've done all the fiddling with the queues, check on the way out to see if all the active // preferred leaders that we intended to change are in fact the leaders. private void checkLeaderStatus() throws InterruptedException, KeeperException { for (int idx = 0; pendingOps.size() > 0 && idx < 600; ++idx) { ClusterState clusterState = coreContainer.getZkController().getClusterState(); Set liveNodes = clusterState.getLiveNodes(); DocCollection dc = clusterState.getCollection(collectionName); for (Slice slice : dc.getSlices()) { for (Replica replica : slice.getReplicas()) { if (replica.isActive(liveNodes) && replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false)) { if (replica.getBool(LEADER_PROP, false)) { if (pendingOps.containsKey(slice.getName())) { // Record for return that the leader changed successfully pendingOps.remove(slice.getName()); addToSuccesses(slice, replica); break; } } } } } TimeUnit.MILLISECONDS.sleep(100); coreContainer.getZkController().getZkStateReader().forciblyRefreshAllClusterStateSlow(); } addAnyFailures(); } // The process is: // if the replica with preferredLeader is already the leader, do nothing // Otherwise: // > if two nodes have the same sequence number and both point to the current leader, we presume // that we've just moved it, move the one that does _not_ have the preferredLeader to the end of // the list. // > move the current leader to the end of the list. This _should_ mean that the current ephemeral // node in the leader election queue is removed and the only remaining node watching it is // triggered to become leader. private void ensurePreferredIsLeader(Slice slice) throws KeeperException, InterruptedException { for (Replica replica : slice.getReplicas()) { // Tell the replica to become the leader if we're the preferred leader AND active AND not the // leader already if (replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false) == false) { continue; } // OK, we are the preferred leader, are we the actual leader? if (replica.getBool(LEADER_PROP, false)) { // We're a preferred leader, but we're _also_ the leader, don't need to do anything. addAlreadyLeaderToResults(slice, replica); return; // already the leader, do nothing. } ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader(); // We're the preferred leader, but someone else is leader. Only become leader if we're active. if (replica.isActive(zkStateReader.getClusterState().getLiveNodes()) == false) { addInactiveToResults(slice, replica); return; // Don't try to become the leader if we're not active! } List electionNodes = OverseerTaskProcessor.getSortedElectionNodes( zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName())); if (electionQueueInBadState(electionNodes, slice, replica)) { return; } // Replica is the preferred leader but not the actual leader, do something about that. // "Something" is // 1> if the preferred leader isn't first in line, tell it to re-queue itself. // 2> tell the actual leader to re-queue itself. // Ok, the sorting for election nodes is a bit strange. If the sequence numbers are the same, // then the whole string is used, but that sorts nodes with the same sequence number by their // session IDs from ZK. While this is determinate, it's not quite what we need, so re-queue // nodes that aren't us and are watching the leader node. String firstWatcher = electionNodes.get(1); if (LeaderElector.getNodeName(firstWatcher).equals(replica.getName()) == false) { makeReplicaFirstWatcher(slice, replica); } // This replica should be the leader at the end of the day, so let's record that information // to check at the end pendingOps.put(slice.getName(), replica.getName()); String leaderElectionNode = electionNodes.get(0); String coreName = slice.getReplica(LeaderElector.getNodeName(leaderElectionNode)).getStr(CORE_NAME_PROP); rejoinElectionQueue(slice, leaderElectionNode, coreName, false); waitForNodeChange(slice, leaderElectionNode); return; // Done with this slice, skip the rest of the replicas. } } // Check that the election queue has some members! There really should be two or more for this to // make any sense, if there's only one we can't change anything. private boolean electionQueueInBadState( List electionNodes, Slice slice, Replica replica) { // if there's only one node in the queue, should already be leader and we shouldn't be here // anyway. if (electionNodes.size() < 2) { log.warn( "Rebalancing leaders and slice {} has less than two elements in the leader election queue, but replica {} doesn't think it's the leader.", slice.getName(), replica.getName()); return true; } return false; } // Provide some feedback to the user about what actually happened, or in this case where no action // was possible private void addInactiveToResults(Slice slice, Replica replica) { SimpleOrderedMap> inactives = results.get(INACTIVE_PREFERREDS); if (inactives == null) { inactives = new SimpleOrderedMap<>(); results.add(INACTIVE_PREFERREDS, inactives); } SimpleOrderedMap res = new SimpleOrderedMap<>(); res.add("status", "skipped"); res.add( "msg", "Replica " + replica.getName() + " is a referredLeader for shard " + slice.getName() + ", but is inactive. No change necessary"); inactives.add(replica.getName(), res); } // Provide some feedback to the user about what actually happened, or in this case where no action // was necesary since this preferred replica was already the leader private void addAlreadyLeaderToResults(Slice slice, Replica replica) { SimpleOrderedMap> alreadyLeaders = results.get(ALREADY_LEADERS); if (alreadyLeaders == null) { alreadyLeaders = new SimpleOrderedMap<>(); results.add(ALREADY_LEADERS, alreadyLeaders); } SimpleOrderedMap res = new SimpleOrderedMap<>(); res.add("status", "skipped"); res.add( "msg", "Replica " + replica.getName() + " is already the leader for shard " + slice.getName() + ". No change necessary"); alreadyLeaders.add(replica.getName(), res); } // Put the replica in at the head of the queue and send all nodes with the same sequence number to // the back of the list. There can be "ties", i.e. replicas in the queue with the same sequence // number. Sorting doesn't necessarily sort the one we most care about first. So put the node we // _don't care about at the end of the election queue_ void makeReplicaFirstWatcher(Slice slice, Replica replica) throws KeeperException, InterruptedException { ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader(); List electionNodes = OverseerTaskProcessor.getSortedElectionNodes( zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName())); // First, queue up the preferred leader watching the leader if it isn't already int secondSeq = Integer.MAX_VALUE; int candidateSeq = -1; for (int idx = 1; idx < electionNodes.size(); ++idx) { String candidate = electionNodes.get(idx); secondSeq = Math.min(secondSeq, LeaderElector.getSeq(candidate)); if (LeaderElector.getNodeName(candidate).equals(replica.getName())) { candidateSeq = LeaderElector.getSeq(candidate); } } int newSeq = -1; if (candidateSeq == secondSeq) { // the preferredLeader is already watching the leader, no need to move it around. newSeq = secondSeq; } else { for (String electionNode : electionNodes) { if (LeaderElector.getNodeName(electionNode).equals(replica.getName())) { // Make the preferred leader watch the leader. String coreName = slice.getReplica(LeaderElector.getNodeName(electionNode)).getStr(CORE_NAME_PROP); rejoinElectionQueue(slice, electionNode, coreName, true); newSeq = waitForNodeChange(slice, electionNode); break; } } } if (newSeq == -1) { return; // let's not continue if we didn't get what we expect. Possibly we're offline etc.. } // Now find other nodes that have the same sequence number as this node and re-queue them at the // end of the queue. electionNodes = OverseerTaskProcessor.getSortedElectionNodes( zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName())); for (String thisNode : electionNodes) { if (LeaderElector.getSeq(thisNode) > newSeq) { break; } if (LeaderElector.getNodeName(thisNode).equals(replica.getName())) { continue; } // We won't get here for the preferredLeader node if (LeaderElector.getSeq(thisNode) == newSeq) { String coreName = slice.getReplica(LeaderElector.getNodeName(thisNode)).getStr(CORE_NAME_PROP); rejoinElectionQueue(slice, thisNode, coreName, false); waitForNodeChange(slice, thisNode); } } } // We're just waiting for the electionNode to rejoin the queue with a _different_ node, indicating // that any requeueing we've done has happened. int waitForNodeChange(Slice slice, String electionNode) throws InterruptedException, KeeperException { String nodeName = LeaderElector.getNodeName(electionNode); int oldSeq = LeaderElector.getSeq(electionNode); for (int idx = 0; idx < 600; ++idx) { ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader(); List electionNodes = OverseerTaskProcessor.getSortedElectionNodes( zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName())); for (String testNode : electionNodes) { if (LeaderElector.getNodeName(testNode).equals(nodeName) && oldSeq != LeaderElector.getSeq(testNode)) { return LeaderElector.getSeq(testNode); } } TimeUnit.MILLISECONDS.sleep(100); zkStateReader.forciblyRefreshAllClusterStateSlow(); } return -1; } // Move an election node to some other place in the queue. If rejoinAtHead==false, then at the // end, otherwise the new node should point at the leader. private void rejoinElectionQueue( Slice slice, String electionNode, String core, boolean rejoinAtHead) throws KeeperException, InterruptedException { Replica replica = slice.getReplica(LeaderElector.getNodeName(electionNode)); final CollectionParams.CollectionAction rebalanceleaders = REBALANCELEADERS; Map propMap = new HashMap<>(); propMap.put(COLLECTION_PROP, collectionName); propMap.put(SHARD_ID_PROP, slice.getName()); propMap.put(QUEUE_OPERATION, rebalanceleaders.toLower()); propMap.put(CORE_NAME_PROP, core); propMap.put(CORE_NODE_NAME_PROP, replica.getName()); propMap.put(ZkStateReader.NODE_NAME_PROP, replica.getNodeName()); propMap.put( ZkStateReader.BASE_URL_PROP, coreContainer .getZkController() .getZkStateReader() .getBaseUrlForNodeName(replica.getNodeName())); propMap.put( REJOIN_AT_HEAD_PROP, Boolean.toString(rejoinAtHead)); // Get ourselves to be first in line. propMap.put(ELECTION_NODE_PROP, electionNode); String asyncId = rebalanceleaders.toLower() + "_" + core + "_" + Math.abs(System.nanoTime()); propMap.put(ASYNC, asyncId); asyncRequests.add(asyncId); collectionsHandler.submitCollectionApiCommand( new ZkNodeProps(propMap), rebalanceleaders); // ignore response; we construct our own } // maxWaitSecs - How long are we going to wait? Defaults to 30 seconds. // waitForAll - if true, do not return until all requests have been processed. "Processed" could // mean failure! // private boolean waitAsyncRequests(final int maxWaitSecs, Boolean waitForAll) throws KeeperException, InterruptedException { if (asyncRequests.size() == 0) { return true; } for (int idx = 0; idx < maxWaitSecs * 10; ++idx) { Iterator iter = asyncRequests.iterator(); boolean foundChange = false; while (iter.hasNext()) { String asyncId = iter.next(); if (coreContainer.getZkController().getOverseerFailureMap().contains(asyncId)) { coreContainer.getZkController().getOverseerFailureMap().remove(asyncId); coreContainer.getZkController().clearAsyncId(asyncId); iter.remove(); foundChange = true; } else if (coreContainer.getZkController().getOverseerCompletedMap().contains(asyncId)) { coreContainer.getZkController().getOverseerCompletedMap().remove(asyncId); coreContainer.getZkController().clearAsyncId(asyncId); iter.remove(); foundChange = true; } } // We're done if we're processing a few at a time or all requests are processed. We don't want // to change, say, 100s of leaders simultaneously. So if the request specifies some limit, and // we're at that limit, we want to return to the caller so it can immediately add another // request. That's the purpose of the first clause here. Otherwise, of course, just return if // all requests are processed. if ((foundChange && waitForAll == false) || asyncRequests.size() == 0) { return true; } TimeUnit.MILLISECONDS.sleep(100); } // If we get here, we've timed out waiting. return false; } // If we actually changed the leader, we should send that fact back in the response. private void addToSuccesses(Slice slice, Replica replica) { SimpleOrderedMap> successes = results.get("successes"); if (successes == null) { successes = new SimpleOrderedMap<>(); results.add("successes", successes); } if (log.isInfoEnabled()) { log.info( "Successfully changed leader of shard {} to replica {}", slice.getName(), replica.getName()); } SimpleOrderedMap res = new SimpleOrderedMap<>(); res.add("status", "success"); res.add( "msg", "Successfully changed leader of slice " + slice.getName() + " to " + replica.getName()); successes.add(slice.getName(), res); } // If for any reason we were supposed to change leadership, that should be recorded in // changingLeaders. Any time we verified that the change actually occurred, that entry should have // been removed. So report anything left over as a failure. private void addAnyFailures() { if (pendingOps.size() == 0) { return; } SimpleOrderedMap> fails = new SimpleOrderedMap<>(); results.add("failures", fails); for (Map.Entry ent : pendingOps.entrySet()) { if (log.isInfoEnabled()) { log.info("Failed to change leader of shard {} to replica {}", ent.getKey(), ent.getValue()); } SimpleOrderedMap res = new SimpleOrderedMap<>(); res.add("status", "failed"); res.add( "msg", String.format( Locale.ROOT, "Could not change leder for slice %s to %s", ent.getKey(), ent.getValue())); fails.add(ent.getKey(), res); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy