org.apache.pinot.broker.routing.instanceselector.ReplicaGroupInstanceSelector Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.pinot.broker.routing.instanceselector;

import java.time.Clock;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.helix.store.zk.ZkHelixPropertyStore;
import org.apache.helix.zookeeper.datamodel.ZNRecord;
import org.apache.pinot.broker.routing.adaptiveserverselector.AdaptiveServerSelector;
import org.apache.pinot.common.metrics.BrokerMetrics;
import org.apache.pinot.common.utils.HashUtil;
import org.apache.pinot.common.utils.config.QueryOptionsUtils;


/**
 * Instance selector for replica-group routing strategy.
 * The selection algorithm will always evenly distribute the traffic to all replicas of each segment, and will select
 * the same index of the enabled instances for all segments with the same number of replicas. The algorithm is very
 * light-weight and will do best effort to select the least servers for the request.
 * 
The algorithm relies on the mirror segment assignment from replica-group segment assignment strategy. With mirror
 * segment assignment, any server in one replica-group will always have a corresponding server in other replica-groups
 * that have the same segments assigned. For an example, if S1 is a server in replica-group 1, and it has mirror server
 * S2 in replica-group 2 and S3 in replica-group 3. All segments assigned to S1 will also be assigned to S2 and S3. In
 * stable scenario (external view matches ideal state), all segments assigned to S1 will have the same enabled instances
 * of [S1, S2, S3] sorted (in alphabetical order). If we always pick the same index of enabled instances for all
 * segments, only one of S1, S2, S3 will be picked, so it is guaranteed that we pick the least server instances for the
 * request (there is no guarantee on choosing servers from the same replica-group though). In transitioning/error
 * scenario (external view does not match ideal state), there is no guarantee on picking the least server instances, but
 * the traffic is guaranteed to be evenly distributed to all available instances to avoid overwhelming hotspot servers.
 *
 If the query option NUM_REPLICA_GROUPS_TO_QUERY is provided, the servers to be picked will be from different
 * replica groups such that segments are evenly distributed amongst the provided value of NUM_REPLICA_GROUPS_TO_QUERY.
 * Thus in case of [S1, S2, S3] if NUM_REPLICA_GROUPS_TO_QUERY = 2, the ReplicaGroup S1 and ReplicaGroup S2 will be
 * selected such that half the segments will come from S1 and other half from S2. If NUM_REPLICA_GROUPS_TO_QUERY value
 * is much greater than available servers, then ReplicaGroupInstanceSelector will behave similar to
 * BalancedInstanceSelector.
 * If AdaptiveServerSelection is enabled, a single snapshot of the server ranking is fetched. This ranking is
 * referenced to pick the best available server for each segment. The algorithm ends up picking the minimum number of
 * servers required to process a query because it references a single snapshot of the server rankings. Currently,
 * NUM_REPLICA_GROUPS_TO_QUERY is not supported is AdaptiveServerSelection is enabled.
 */
public class ReplicaGroupInstanceSelector extends BaseInstanceSelector {

  public ReplicaGroupInstanceSelector(String tableNameWithType, ZkHelixPropertyStore propertyStore,
      BrokerMetrics brokerMetrics, @Nullable AdaptiveServerSelector adaptiveServerSelector, Clock clock) {
    super(tableNameWithType, propertyStore, brokerMetrics, adaptiveServerSelector, clock);
  }

  @Override
  Map select(List segments, int requestId, SegmentStates segmentStates,
      Map queryOptions) {
    if (_adaptiveServerSelector != null) {
      // Adaptive Server Selection is enabled.
      List serverRankList = new ArrayList<>();
      List candidateServers = fetchCandidateServersForQuery(segments, segmentStates);

      // Fetch serverRankList before looping through all the segments. This is important to make sure that we pick
      // the least amount of instances for a query by referring to a single snapshot of the rankings.
      List> serverRankListWithScores =
          _adaptiveServerSelector.fetchServerRankingsWithScores(candidateServers);
      for (Pair entry : serverRankListWithScores) {
        serverRankList.add(entry.getLeft());
      }
      return selectServersUsingAdaptiveServerSelector(segments, requestId, segmentStates, serverRankList);
    } else {
      // Adaptive Server Selection is NOT enabled.
      return selectServersUsingRoundRobin(segments, requestId, segmentStates, queryOptions);
    }
  }

  private Map selectServersUsingRoundRobin(List segments, int requestId,
      SegmentStates segmentStates, Map queryOptions) {
    Map selectedServers = new HashMap<>(HashUtil.getHashMapCapacity(segments.size()));
    Integer numReplicaGroupsToQuery = QueryOptionsUtils.getNumReplicaGroupsToQuery(queryOptions);
    int numReplicaGroups = numReplicaGroupsToQuery == null ? 1 : numReplicaGroupsToQuery;
    int replicaOffset = 0;
    for (String segment : segments) {
      List candidates = segmentStates.getCandidates(segment);
      // NOTE: candidates can be null when there is no enabled instances for the segment, or the instance selector has
      // not been updated (we update all components for routing in sequence)
      if (candidates == null) {
        continue;
      }
      // Round robin selection.
      int numCandidates = candidates.size();
      int instanceIdx = (requestId + replicaOffset) % numCandidates;
      SegmentInstanceCandidate selectedInstance = candidates.get(instanceIdx);
      // Only put online instance.
      // This can only be offline when it is a new segment.
      if (selectedInstance.isOnline()) {
        selectedServers.put(segment, selectedInstance.getInstance());
      }
      if (numReplicaGroups > numCandidates) {
        numReplicaGroups = numCandidates;
      }
      replicaOffset = (replicaOffset + 1) % numReplicaGroups;
    }
    return selectedServers;
  }

  private Map selectServersUsingAdaptiveServerSelector(List segments, int requestId,
      SegmentStates segmentStates, List serverRankList) {
    Map selectedServers = new HashMap<>(HashUtil.getHashMapCapacity(segments.size()));
    for (String segment : segments) {
      // NOTE: candidates can be null when there is no enabled instances for the segment, or the instance selector has
      // not been updated (we update all components for routing in sequence)
      List candidates = segmentStates.getCandidates(segment);
      if (candidates == null) {
        continue;
      }
      // Round Robin.
      int numCandidates = candidates.size();
      int instanceIdx = requestId % numCandidates;
      SegmentInstanceCandidate selectedInstance = candidates.get(instanceIdx);
      // Adaptive Server Selection
      // TODO: Support numReplicaGroupsToQuery with Adaptive Server Selection.
      if (!serverRankList.isEmpty()) {
        int minIdx = Integer.MAX_VALUE;
        for (SegmentInstanceCandidate candidate : candidates) {
          int idx = serverRankList.indexOf(candidate.getInstance());
          if (idx == -1) {
            // Let's use the round-robin approach until stats for all servers are populated.
            selectedInstance = candidates.get(instanceIdx);
            break;
          }
          if (idx < minIdx) {
            minIdx = idx;
            selectedInstance = candidate;
          }
        }
      }
      // Only put online instance.
      // This can only be offline when it is a new segment.
      if (selectedInstance.isOnline()) {
        selectedServers.put(segment, selectedInstance.getInstance());
      }
    }
    return selectedServers;
  }

  private List fetchCandidateServersForQuery(List segments, SegmentStates segmentStates) {
    Set candidateServers = new HashSet<>();
    for (String segment : segments) {
      List candidates = segmentStates.getCandidates(segment);
      if (candidates == null) {
        continue;
      }
      for (SegmentInstanceCandidate candidate : candidates) {
        candidateServers.add(candidate.getInstance());
      }
    }
    return new ArrayList<>(candidateServers);
  }
}