All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyRackFaultTolerant Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;

import java.util.*;

/**
 * The class is responsible for choosing the desired number of targets
 * for placing block replicas.
 * The strategy is that it tries its best to place the replicas to most racks.
 */
@InterfaceAudience.Private
public class BlockPlacementPolicyRackFaultTolerant extends BlockPlacementPolicyDefault {

  @Override
  protected int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) {
    int clusterSize = clusterMap.getNumOfLeaves();
    int totalNumOfReplicas = numOfChosen + numOfReplicas;
    if (totalNumOfReplicas > clusterSize) {
      numOfReplicas -= (totalNumOfReplicas-clusterSize);
      totalNumOfReplicas = clusterSize;
    }
    // No calculation needed when there is only one rack or picking one node.
    int numOfRacks = clusterMap.getNumOfNonEmptyRacks();
    // HDFS-14527 return default when numOfRacks = 0 to avoid
    // ArithmeticException when calc maxNodesPerRack at following logic.
    if (numOfRacks <= 1 || totalNumOfReplicas <= 1) {
      return new int[] {numOfReplicas, totalNumOfReplicas};
    }
    // If more racks than replicas, put one replica per rack.
    if (totalNumOfReplicas < numOfRacks) {
      return new int[] {numOfReplicas, 1};
    }
    // If more replicas than racks, evenly spread the replicas.
    // This calculation rounds up.
    int maxNodesPerRack = (totalNumOfReplicas - 1) / numOfRacks + 1;
    return new int[] {numOfReplicas, maxNodesPerRack};
  }

  /**
   * Choose numOfReplicas in order:
   * 1. If total replica expected is less than numOfRacks in cluster, it choose
   * randomly.
   * 2. If total replica expected is bigger than numOfRacks, it choose:
   *  2a. Fill each rack exactly (maxNodesPerRack-1) replicas.
   *  2b. For some random racks, place one more replica to each one of them,
   *  until numOfReplicas have been chosen. 
* 3. If after step 2, there are still replicas not placed (due to some * racks have fewer datanodes than maxNodesPerRack), the rest of the replicas * is placed evenly on the rest of the racks who have Datanodes that have * not been placed a replica. * 4. If after step 3, there are still replicas not placed. A * {@link NotEnoughReplicasException} is thrown. *

* For normal setups, step 2 would suffice. So in the end, the difference * of the numbers of replicas for each two racks is no more than 1. * Either way it always prefer local storage. * @return local node of writer */ @Override protected Node chooseTargetInOrder(int numOfReplicas, Node writer, final Set excludedNodes, final long blocksize, final int maxNodesPerRack, final List results, final boolean avoidStaleNodes, final boolean newBlock, EnumMap storageTypes) throws NotEnoughReplicasException { int totalReplicaExpected = results.size() + numOfReplicas; int numOfRacks = clusterMap.getNumOfNonEmptyRacks(); try { if (totalReplicaExpected < numOfRacks || totalReplicaExpected % numOfRacks == 0) { writer = chooseOnce(numOfReplicas, writer, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); return writer; } assert totalReplicaExpected > (maxNodesPerRack -1) * numOfRacks; // Calculate numOfReplicas for filling each rack exactly (maxNodesPerRack-1) // replicas. HashMap rackCounts = new HashMap<>(); for (DatanodeStorageInfo dsInfo : results) { String rack = dsInfo.getDatanodeDescriptor().getNetworkLocation(); Integer count = rackCounts.get(rack); if (count != null) { rackCounts.put(rack, count + 1); } else { rackCounts.put(rack, 1); } } int excess = 0; // Sum of the above (maxNodesPerRack-1) part of nodes in results for (int count : rackCounts.values()) { if (count > maxNodesPerRack -1) { excess += count - (maxNodesPerRack -1); } } numOfReplicas = Math.min(totalReplicaExpected - results.size(), (maxNodesPerRack -1) * numOfRacks - (results.size() - excess)); // Try to spread the replicas as evenly as possible across racks. // This is done by first placing with (maxNodesPerRack-1), then spreading // the remainder by calling again with maxNodesPerRack. writer = chooseOnce(numOfReplicas, writer, new HashSet<>(excludedNodes), blocksize, maxNodesPerRack - 1, results, avoidStaleNodes, storageTypes); // Exclude the chosen nodes for (DatanodeStorageInfo resultStorage : results) { addToExcludedNodes(resultStorage.getDatanodeDescriptor(), excludedNodes); } LOG.trace("Chosen nodes: {}", results); LOG.trace("Excluded nodes: {}", excludedNodes); numOfReplicas = totalReplicaExpected - results.size(); chooseOnce(numOfReplicas, writer, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } catch (NotEnoughReplicasException e) { LOG.warn("Only able to place {} of total expected {}" + " (maxNodesPerRack={}, numOfReplicas={}) nodes " + "evenly across racks, falling back to evenly place on the " + "remaining racks. This may not guarantee rack-level fault " + "tolerance. Please check if the racks are configured properly.", results.size(), totalReplicaExpected, maxNodesPerRack, numOfReplicas); LOG.debug("Caught exception was:", e); chooseEvenlyFromRemainingRacks(writer, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes, totalReplicaExpected, e); } return writer; } /** * Choose as evenly as possible from the racks which have available datanodes. */ private void chooseEvenlyFromRemainingRacks(Node writer, Set excludedNodes, long blocksize, int maxNodesPerRack, List results, boolean avoidStaleNodes, EnumMap storageTypes, int totalReplicaExpected, NotEnoughReplicasException e) throws NotEnoughReplicasException { int numResultsOflastChoose = 0; NotEnoughReplicasException lastException = e; int bestEffortMaxNodesPerRack = maxNodesPerRack; while (results.size() != totalReplicaExpected && numResultsOflastChoose != results.size()) { // Exclude the chosen nodes final Set newExcludeNodes = new HashSet<>(); for (DatanodeStorageInfo resultStorage : results) { addToExcludedNodes(resultStorage.getDatanodeDescriptor(), newExcludeNodes); } LOG.trace("Chosen nodes: {}", results); LOG.trace("Excluded nodes: {}", excludedNodes); LOG.trace("New Excluded nodes: {}", newExcludeNodes); final int numOfReplicas = totalReplicaExpected - results.size(); numResultsOflastChoose = results.size(); try { chooseOnce(numOfReplicas, writer, newExcludeNodes, blocksize, ++bestEffortMaxNodesPerRack, results, avoidStaleNodes, storageTypes); } catch (NotEnoughReplicasException nere) { lastException = nere; } finally { excludedNodes.addAll(newExcludeNodes); } } if (numResultsOflastChoose != totalReplicaExpected) { LOG.debug("Best effort placement failed: expecting {} replicas, only " + "chose {}.", totalReplicaExpected, numResultsOflastChoose); throw lastException; } } /** * Randomly choose numOfReplicas targets from the given scope. * Except that 1st replica prefer local storage. * @return local node of writer. */ private Node chooseOnce(int numOfReplicas, Node writer, final Set excludedNodes, final long blocksize, final int maxNodesPerRack, final List results, final boolean avoidStaleNodes, EnumMap storageTypes) throws NotEnoughReplicasException { if (numOfReplicas == 0) { return writer; } writer = chooseLocalStorage(writer, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes, true) .getDatanodeDescriptor(); if (--numOfReplicas == 0) { return writer; } chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); return writer; } @Override public BlockPlacementStatus verifyBlockPlacement(DatanodeInfo[] locs, int numberOfReplicas) { if (locs == null) locs = DatanodeDescriptor.EMPTY_ARRAY; if (!clusterMap.hasClusterEverBeenMultiRack()) { // only one rack return new BlockPlacementStatusDefault(1, 1, 1); } // Count locations on different racks. Set racks = new HashSet<>(); for (DatanodeInfo dn : locs) { racks.add(dn.getNetworkLocation()); } return new BlockPlacementStatusDefault(racks.size(), numberOfReplicas, clusterMap.getNumOfNonEmptyRacks()); } @Override protected Collection pickupReplicaSet( Collection moreThanOne, Collection exactlyOne, Map> rackMap) { return moreThanOne.isEmpty() ? exactlyOne : moreThanOne; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy