org.apache.solr.handler.admin.RebalanceLeaders Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.admin;

import static org.apache.solr.cloud.Overseer.QUEUE_OPERATION;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.LEADER_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_AT_ONCE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_WAIT_SECONDS_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.REBALANCELEADERS;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;

import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.solr.cloud.LeaderElector;
import org.apache.solr.cloud.OverseerTaskProcessor;
import org.apache.solr.cloud.overseer.SliceMutator;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The end point for the collections API REBALANCELEADERS call that actually does the work.
 *
 * Overview:
 *
 * 
The leader election process is that each replica of a shard watches one, and only one other
 * replica via ephemeral nodes in ZooKeeper. When the node being watched goes down, the node
 * watching it is sent a notification and, if the node being watched is the leader, the node getting
 * the notification assumes leadership.
 *
 * 
ZooKeeper's ephemeral nodes get a monotonically increasing "sequence number" that defines it's
 * position in the queue
 *
 * 
So to force a particular node to become a leader it must have a watch on the leader. This can
 * lead to two nodes having the same sequence number. Say the process is this replica1 is the leader
 * (seq 1) replica3 is on a Solr node that happens to be started next, it watches the leader (seq2)
 * replica2 is on the next Solr node started. It will _also_ watch the leader, it's sequence number
 * is 2 exactly like replica3s
 *
 * 
This is true on startup, but can also be a consequence of, say, a replica going into recovery.
 * It's no longer eligible to become leader, so will be put at the end of the queue by default. So
 * there's code to put it in the queue with the same sequence number as the current second replica.
 *
 * 
To compilcate matters further, when the nodes are sorted (see
 * OverseerTaskProcessor.getSortedElectionNodes) the primary sort is on the sequence number,
 * secondary sort on the session ID. So the preferredLeader may or may not be second in that list.
 *
 * 
what all this means is that when the REBALANCELEADER command is issued, this class examines
 * the election queue and performs just three things for each shard in the collection:
 *
 * 
1> insures that the preferredLeader is watching the leader (rejoins the election queue at the
 * head)
 *
 * 
2> if there are two ephemeral nodes with the same sequence number watching the leader, and if
 * one of them is the preferredLeader it will send the _other_ node to the end of the queue (rejoins
 * it)
 *
 * 
3> rejoins the zeroth entry in the list at the end of the queue, which triggers the watch on
 * the preferredLeader replica which then takes over leadership
 *
 * All this of course assuming the preferedLeader is alive and well and is assigned for any given
 * shard.
 */
class RebalanceLeaders {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  final SolrQueryRequest req;
  final SolrQueryResponse rsp;
  final CollectionsHandler collectionsHandler;
  final CoreContainer coreContainer;
  private final Set asyncRequests = new HashSet<>();
  static final String INACTIVE_PREFERREDS = "inactivePreferreds";
  static final String ALREADY_LEADERS = "alreadyLeaders";
  static final String SUMMARY = "Summary";
  final SimpleOrderedMap>> results =
      new SimpleOrderedMap<>();
  final Map pendingOps = new HashMap<>();
  private String collectionName;

  RebalanceLeaders(
      SolrQueryRequest req, SolrQueryResponse rsp, CollectionsHandler collectionsHandler) {
    this.req = req;
    this.rsp = rsp;
    this.collectionsHandler = collectionsHandler;
    coreContainer = collectionsHandler.getCoreContainer();
  }

  void execute() throws KeeperException, InterruptedException {
    DocCollection dc = checkParams();

    int max = req.getParams().getInt(MAX_AT_ONCE_PROP, Integer.MAX_VALUE);
    if (max <= 0) max = Integer.MAX_VALUE;
    int maxWaitSecs = req.getParams().getInt(MAX_WAIT_SECONDS_PROP, 60);

    // If there are a maximum number of simultaneous requests specified, we have to pause when we
    // have that many outstanding requests and wait for at least one to finish before going on the
    // the next rebalance.
    boolean keepGoing = true;
    for (Slice slice : dc.getSlices()) {
      ensurePreferredIsLeader(slice);
      if (asyncRequests.size() == max) {
        log.info("Queued {} leader reassignments, waiting for some to complete.", max);
        keepGoing = waitAsyncRequests(maxWaitSecs, false);
        if (keepGoing == false) {
          break; // If we've waited longer than specified, don't continue to wait!
        }
      }
    }
    if (keepGoing == true) {
      keepGoing = waitAsyncRequests(maxWaitSecs, true);
    }
    if (keepGoing == true) {
      log.info("All leader reassignments completed.");
    } else {
      log.warn(
          "Exceeded specified timeout of '{}' all leaders may not have been reassigned'",
          maxWaitSecs);
    }

    checkLeaderStatus();
    SimpleOrderedMap summary = new SimpleOrderedMap<>();
    if (pendingOps.size() == 0) {
      summary.add(
          "Success", "All active replicas with the preferredLeader property set are leaders");
    } else {
      summary.add("Failure", "Not all active replicas with preferredLeader property are leaders");
    }
    rsp.getValues().add(SUMMARY, summary); // we want this first.

    rsp.getValues().addAll(results);
  }

  // Insure that ll required parameters are there and the doc colection exists.
  private DocCollection checkParams() throws KeeperException, InterruptedException {
    req.getParams().required().check(COLLECTION_PROP);

    collectionName = req.getParams().get(COLLECTION_PROP);
    if (StrUtils.isBlank(collectionName)) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          String.format(
              Locale.ROOT,
              "The " + COLLECTION_PROP + " is required for the Rebalance Leaders command."));
    }
    coreContainer.getZkController().getZkStateReader().forceUpdateCollection(collectionName);
    ClusterState clusterState = coreContainer.getZkController().getClusterState();

    DocCollection dc = clusterState.getCollection(collectionName);
    if (dc == null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Collection '" + collectionName + "' does not exist, no action taken.");
    }
    return dc;
  }

  // Once we've done all the fiddling with the queues, check on the way out to see if all the active
  // preferred leaders that we intended to change are in fact the leaders.
  private void checkLeaderStatus() throws InterruptedException, KeeperException {
    for (int idx = 0; pendingOps.size() > 0 && idx < 600; ++idx) {
      ClusterState clusterState = coreContainer.getZkController().getClusterState();
      Set liveNodes = clusterState.getLiveNodes();
      DocCollection dc = clusterState.getCollection(collectionName);
      for (Slice slice : dc.getSlices()) {
        for (Replica replica : slice.getReplicas()) {
          if (replica.isActive(liveNodes)
              && replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false)) {
            if (replica.getBool(LEADER_PROP, false)) {
              if (pendingOps.containsKey(slice.getName())) {
                // Record for return that the leader changed successfully
                pendingOps.remove(slice.getName());
                addToSuccesses(slice, replica);
                break;
              }
            }
          }
        }
      }
      TimeUnit.MILLISECONDS.sleep(100);
      coreContainer.getZkController().getZkStateReader().forciblyRefreshAllClusterStateSlow();
    }
    addAnyFailures();
  }

  // The process is:
  // if the replica with preferredLeader is already the leader, do nothing
  // Otherwise:
  // > if two nodes have the same sequence number and both point to the current leader, we presume
  // that we've just moved it, move the one that does _not_ have the preferredLeader to the end of
  // the list.
  // > move the current leader to the end of the list. This _should_ mean that the current ephemeral
  // node in the leader election queue is removed and the only remaining node watching it is
  // triggered to become leader.
  private void ensurePreferredIsLeader(Slice slice) throws KeeperException, InterruptedException {
    for (Replica replica : slice.getReplicas()) {
      // Tell the replica to become the leader if we're the preferred leader AND active AND not the
      // leader already
      if (replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false) == false) {
        continue;
      }
      // OK, we are the preferred leader, are we the actual leader?
      if (replica.getBool(LEADER_PROP, false)) {
        // We're a preferred leader, but we're _also_ the leader, don't need to do anything.
        addAlreadyLeaderToResults(slice, replica);
        return; // already the leader, do nothing.
      }
      ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
      // We're the preferred leader, but someone else is leader. Only become leader if we're active.
      if (replica.isActive(zkStateReader.getClusterState().getLiveNodes()) == false) {
        addInactiveToResults(slice, replica);
        return; // Don't try to become the leader if we're not active!
      }

      List electionNodes =
          OverseerTaskProcessor.getSortedElectionNodes(
              zkStateReader.getZkClient(),
              ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));

      if (electionQueueInBadState(electionNodes, slice, replica)) {
        return;
      }

      // Replica is the preferred leader but not the actual leader, do something about that.
      // "Something" is
      // 1> if the preferred leader isn't first in line, tell it to re-queue itself.
      // 2> tell the actual leader to re-queue itself.

      // Ok, the sorting for election nodes is a bit strange. If the sequence numbers are the same,
      // then the whole string is used, but that sorts nodes with the same sequence number by their
      // session IDs from ZK. While this is determinate, it's not quite what we need, so re-queue
      // nodes that aren't us and are watching the leader node.

      String firstWatcher = electionNodes.get(1);

      if (LeaderElector.getNodeName(firstWatcher).equals(replica.getName()) == false) {
        makeReplicaFirstWatcher(slice, replica);
      }

      // This replica should be the leader at the end of the day, so let's record that information
      // to check at the end
      pendingOps.put(slice.getName(), replica.getName());
      String leaderElectionNode = electionNodes.get(0);
      String coreName =
          slice.getReplica(LeaderElector.getNodeName(leaderElectionNode)).getStr(CORE_NAME_PROP);
      rejoinElectionQueue(slice, leaderElectionNode, coreName, false);
      waitForNodeChange(slice, leaderElectionNode);

      return; // Done with this slice, skip the rest of the replicas.
    }
  }

  // Check that the election queue has some members! There really should be two or more for this to
  // make any sense, if there's only one we can't change anything.
  private boolean electionQueueInBadState(
      List electionNodes, Slice slice, Replica replica) {
    // if there's only one node in the queue, should already be leader and we shouldn't be here
    // anyway.
    if (electionNodes.size() < 2) {
      log.warn(
          "Rebalancing leaders and slice {} has less than two elements in the leader election queue, but replica {} doesn't think it's the leader.",
          slice.getName(),
          replica.getName());
      return true;
    }

    return false;
  }

  // Provide some feedback to the user about what actually happened, or in this case where no action
  // was possible
  private void addInactiveToResults(Slice slice, Replica replica) {
    SimpleOrderedMap> inactives = results.get(INACTIVE_PREFERREDS);
    if (inactives == null) {
      inactives = new SimpleOrderedMap<>();
      results.add(INACTIVE_PREFERREDS, inactives);
    }
    SimpleOrderedMap res = new SimpleOrderedMap<>();
    res.add("status", "skipped");
    res.add(
        "msg",
        "Replica "
            + replica.getName()
            + " is a referredLeader for shard "
            + slice.getName()
            + ", but is inactive. No change necessary");
    inactives.add(replica.getName(), res);
  }

  // Provide some feedback to the user about what actually happened, or in this case where no action
  // was necesary since this preferred replica was already the leader
  private void addAlreadyLeaderToResults(Slice slice, Replica replica) {
    SimpleOrderedMap> alreadyLeaders = results.get(ALREADY_LEADERS);
    if (alreadyLeaders == null) {
      alreadyLeaders = new SimpleOrderedMap<>();
      results.add(ALREADY_LEADERS, alreadyLeaders);
    }
    SimpleOrderedMap res = new SimpleOrderedMap<>();
    res.add("status", "skipped");
    res.add(
        "msg",
        "Replica "
            + replica.getName()
            + " is already the leader for shard "
            + slice.getName()
            + ". No change necessary");
    alreadyLeaders.add(replica.getName(), res);
  }

  // Put the replica in at the head of the queue and send all nodes with the same sequence number to
  // the back of the list. There can be "ties", i.e. replicas in the queue with the same sequence
  // number. Sorting doesn't necessarily sort the one we most care about first. So put the node we
  // _don't care about at the end of the election queue_

  void makeReplicaFirstWatcher(Slice slice, Replica replica)
      throws KeeperException, InterruptedException {

    ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
    List electionNodes =
        OverseerTaskProcessor.getSortedElectionNodes(
            zkStateReader.getZkClient(),
            ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));

    // First, queue up the preferred leader watching the leader if it isn't already
    int secondSeq = Integer.MAX_VALUE;

    int candidateSeq = -1;
    for (int idx = 1; idx < electionNodes.size(); ++idx) {
      String candidate = electionNodes.get(idx);
      secondSeq = Math.min(secondSeq, LeaderElector.getSeq(candidate));
      if (LeaderElector.getNodeName(candidate).equals(replica.getName())) {
        candidateSeq = LeaderElector.getSeq(candidate);
      }
    }
    int newSeq = -1;
    if (candidateSeq == secondSeq) {
      // the preferredLeader is already watching the leader, no need to move it around.
      newSeq = secondSeq;
    } else {
      for (String electionNode : electionNodes) {
        if (LeaderElector.getNodeName(electionNode).equals(replica.getName())) {
          // Make the preferred leader watch the leader.
          String coreName =
              slice.getReplica(LeaderElector.getNodeName(electionNode)).getStr(CORE_NAME_PROP);
          rejoinElectionQueue(slice, electionNode, coreName, true);
          newSeq = waitForNodeChange(slice, electionNode);
          break;
        }
      }
    }
    if (newSeq == -1) {
      return; // let's not continue if we didn't get what we expect. Possibly we're offline etc..
    }

    // Now find other nodes that have the same sequence number as this node and re-queue them at the
    // end of the queue.
    electionNodes =
        OverseerTaskProcessor.getSortedElectionNodes(
            zkStateReader.getZkClient(),
            ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));

    for (String thisNode : electionNodes) {
      if (LeaderElector.getSeq(thisNode) > newSeq) {
        break;
      }
      if (LeaderElector.getNodeName(thisNode).equals(replica.getName())) {
        continue;
      }
      // We won't get here for the preferredLeader node
      if (LeaderElector.getSeq(thisNode) == newSeq) {
        String coreName =
            slice.getReplica(LeaderElector.getNodeName(thisNode)).getStr(CORE_NAME_PROP);
        rejoinElectionQueue(slice, thisNode, coreName, false);
        waitForNodeChange(slice, thisNode);
      }
    }
  }

  // We're just waiting for the electionNode to rejoin the queue with a _different_ node, indicating
  // that any requeueing we've done has happened.
  int waitForNodeChange(Slice slice, String electionNode)
      throws InterruptedException, KeeperException {
    String nodeName = LeaderElector.getNodeName(electionNode);
    int oldSeq = LeaderElector.getSeq(electionNode);
    for (int idx = 0; idx < 600; ++idx) {

      ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
      List electionNodes =
          OverseerTaskProcessor.getSortedElectionNodes(
              zkStateReader.getZkClient(),
              ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));
      for (String testNode : electionNodes) {
        if (LeaderElector.getNodeName(testNode).equals(nodeName)
            && oldSeq != LeaderElector.getSeq(testNode)) {
          return LeaderElector.getSeq(testNode);
        }
      }
      TimeUnit.MILLISECONDS.sleep(100);
      zkStateReader.forciblyRefreshAllClusterStateSlow();
    }
    return -1;
  }

  // Move an election node to some other place in the queue. If rejoinAtHead==false, then at the
  // end, otherwise the new node should point at the leader.
  private void rejoinElectionQueue(
      Slice slice, String electionNode, String core, boolean rejoinAtHead)
      throws KeeperException, InterruptedException {
    Replica replica = slice.getReplica(LeaderElector.getNodeName(electionNode));
    final CollectionParams.CollectionAction rebalanceleaders = REBALANCELEADERS;
    Map propMap = new HashMap<>();
    propMap.put(COLLECTION_PROP, collectionName);
    propMap.put(SHARD_ID_PROP, slice.getName());
    propMap.put(QUEUE_OPERATION, rebalanceleaders.toLower());
    propMap.put(CORE_NAME_PROP, core);
    propMap.put(CORE_NODE_NAME_PROP, replica.getName());
    propMap.put(ZkStateReader.NODE_NAME_PROP, replica.getNodeName());
    propMap.put(
        ZkStateReader.BASE_URL_PROP,
        coreContainer
            .getZkController()
            .getZkStateReader()
            .getBaseUrlForNodeName(replica.getNodeName()));
    propMap.put(
        REJOIN_AT_HEAD_PROP, Boolean.toString(rejoinAtHead)); // Get ourselves to be first in line.
    propMap.put(ELECTION_NODE_PROP, electionNode);
    String asyncId = rebalanceleaders.toLower() + "_" + core + "_" + Math.abs(System.nanoTime());
    propMap.put(ASYNC, asyncId);
    asyncRequests.add(asyncId);

    collectionsHandler.submitCollectionApiCommand(
        new ZkNodeProps(propMap), rebalanceleaders); // ignore response; we construct our own
  }

  // maxWaitSecs - How long are we going to wait? Defaults to 30 seconds.
  // waitForAll - if true, do not return until all requests have been processed. "Processed" could
  // mean failure!
  //

  private boolean waitAsyncRequests(final int maxWaitSecs, Boolean waitForAll)
      throws KeeperException, InterruptedException {

    if (asyncRequests.size() == 0) {
      return true;
    }

    for (int idx = 0; idx < maxWaitSecs * 10; ++idx) {
      Iterator iter = asyncRequests.iterator();
      boolean foundChange = false;
      while (iter.hasNext()) {
        String asyncId = iter.next();
        if (coreContainer.getZkController().getOverseerFailureMap().contains(asyncId)) {
          coreContainer.getZkController().getOverseerFailureMap().remove(asyncId);
          coreContainer.getZkController().clearAsyncId(asyncId);
          iter.remove();
          foundChange = true;
        } else if (coreContainer.getZkController().getOverseerCompletedMap().contains(asyncId)) {
          coreContainer.getZkController().getOverseerCompletedMap().remove(asyncId);
          coreContainer.getZkController().clearAsyncId(asyncId);
          iter.remove();
          foundChange = true;
        }
      }
      // We're done if we're processing a few at a time or all requests are processed. We don't want
      // to change, say, 100s of leaders simultaneously. So if the request specifies some limit, and
      // we're at that limit, we want to return to the caller so it can immediately add another
      // request. That's the purpose of the first clause here. Otherwise, of course, just return if
      // all requests are processed.
      if ((foundChange && waitForAll == false) || asyncRequests.size() == 0) {
        return true;
      }
      TimeUnit.MILLISECONDS.sleep(100);
    }
    // If we get here, we've timed out waiting.
    return false;
  }

  // If we actually changed the leader, we should send that fact back in the response.
  private void addToSuccesses(Slice slice, Replica replica) {
    SimpleOrderedMap> successes = results.get("successes");
    if (successes == null) {
      successes = new SimpleOrderedMap<>();
      results.add("successes", successes);
    }
    if (log.isInfoEnabled()) {
      log.info(
          "Successfully changed leader of shard {} to replica {}",
          slice.getName(),
          replica.getName());
    }
    SimpleOrderedMap res = new SimpleOrderedMap<>();
    res.add("status", "success");
    res.add(
        "msg",
        "Successfully changed leader of slice " + slice.getName() + " to " + replica.getName());
    successes.add(slice.getName(), res);
  }

  // If for any reason we were supposed to change leadership, that should be recorded in
  // changingLeaders. Any time we verified that the change actually occurred, that entry should have
  // been removed. So report anything left over as a failure.
  private void addAnyFailures() {
    if (pendingOps.size() == 0) {
      return;
    }
    SimpleOrderedMap> fails = new SimpleOrderedMap<>();
    results.add("failures", fails);

    for (Map.Entry ent : pendingOps.entrySet()) {
      if (log.isInfoEnabled()) {
        log.info("Failed to change leader of shard {} to replica {}", ent.getKey(), ent.getValue());
      }
      SimpleOrderedMap res = new SimpleOrderedMap<>();
      res.add("status", "failed");
      res.add(
          "msg",
          String.format(
              Locale.ROOT,
              "Could not change leder for slice %s to %s",
              ent.getKey(),
              ent.getValue()));
      fails.add(ent.getKey(), res);
    }
  }
}