All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hdfs.server.namenode.FSImage Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.apache.hadoop.util.Time.now;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockStoragePolicySuite;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.Storage.FormatConfirmable;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.util.Canceler;
import org.apache.hadoop.hdfs.util.EnumCounters;
import org.apache.hadoop.hdfs.util.MD5FileUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.util.Time;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * FSImage handles checkpointing and logging of the namespace edits.
 * 
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSImage implements Closeable {
  public static final Log LOG = LogFactory.getLog(FSImage.class.getName());

  protected FSEditLog editLog = null;
  private boolean isUpgradeFinalized = false;

  protected NNStorage storage;
  
  /**
   * The last transaction ID that was either loaded from an image
   * or loaded by loading edits files.
   */
  protected long lastAppliedTxId = 0;

  final private Configuration conf;

  protected NNStorageRetentionManager archivalManager;

  /**
   * The collection of newly added storage directories. These are partially
   * formatted then later fully populated along with a VERSION file.
   * For HA, the second part is done when the next checkpoint is saved.
   * This set will be cleared once a VERSION file is created.
   * For non-HA, a new fsimage will be locally generated along with a new
   * VERSION file. This set is not used for non-HA mode.
   */
  private Set newDirs = null;

  /* Used to make sure there are no concurrent checkpoints for a given txid
   * The checkpoint here could be one of the following operations.
   * a. checkpoint when NN is in standby.
   * b. admin saveNameSpace operation.
   * c. download checkpoint file from any remote checkpointer.
  */
  private final Set currentlyCheckpointing =
      Collections.synchronizedSet(new HashSet());

  /**
   * Construct an FSImage
   * @param conf Configuration
   * @throws IOException if default directories are invalid.
   */
  public FSImage(Configuration conf) throws IOException {
    this(conf,
         FSNamesystem.getNamespaceDirs(conf),
         FSNamesystem.getNamespaceEditsDirs(conf));
  }

  /**
   * Construct the FSImage. Set the default checkpoint directories.
   *
   * Setup storage and initialize the edit log.
   *
   * @param conf Configuration
   * @param imageDirs Directories the image can be stored in.
   * @param editsDirs Directories the editlog can be stored in.
   * @throws IOException if directories are invalid.
   */
  protected FSImage(Configuration conf,
                    Collection imageDirs,
                    List editsDirs)
      throws IOException {
    this.conf = conf;

    storage = new NNStorage(conf, imageDirs, editsDirs);
    if(conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
                       DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT)) {
      storage.setRestoreFailedStorage(true);
    }

    this.editLog = new FSEditLog(conf, storage, editsDirs);
    
    archivalManager = new NNStorageRetentionManager(conf, storage, editLog);
  }
 
  void format(FSNamesystem fsn, String clusterId) throws IOException {
    long fileCount = fsn.getTotalFiles();
    // Expect 1 file, which is the root inode
    Preconditions.checkState(fileCount == 1,
        "FSImage.format should be called with an uninitialized namesystem, has " +
        fileCount + " files");
    NamespaceInfo ns = NNStorage.newNamespaceInfo();
    LOG.info("Allocated new BlockPoolId: " + ns.getBlockPoolID());
    ns.clusterID = clusterId;
    
    storage.format(ns);
    editLog.formatNonFileJournals(ns);
    saveFSImageInAllDirs(fsn, 0);
  }
  
  /**
   * Check whether the storage directories and non-file journals exist.
   * If running in interactive mode, will prompt the user for each
   * directory to allow them to format anyway. Otherwise, returns
   * false, unless 'force' is specified.
   * 
   * @param force if true, format regardless of whether dirs exist
   * @param interactive prompt the user when a dir exists
   * @return true if formatting should proceed
   * @throws IOException if some storage cannot be accessed
   */
  boolean confirmFormat(boolean force, boolean interactive) throws IOException {
    List confirms = Lists.newArrayList();
    for (StorageDirectory sd : storage.dirIterable(null)) {
      confirms.add(sd);
    }
    
    confirms.addAll(editLog.getFormatConfirmables());
    return Storage.confirmFormat(confirms, force, interactive);
  }
  
  /**
   * Analyze storage directories.
   * Recover from previous transitions if required. 
   * Perform fs state transition if necessary depending on the namespace info.
   * Read storage info. 
   * 
   * @throws IOException
   * @return true if the image needs to be saved or false otherwise
   */
  boolean recoverTransitionRead(StartupOption startOpt, FSNamesystem target,
      MetaRecoveryContext recovery)
      throws IOException {
    assert startOpt != StartupOption.FORMAT : 
      "NameNode formatting should be performed before reading the image";
    
    Collection imageDirs = storage.getImageDirectories();
    Collection editsDirs = editLog.getEditURIs();

    // none of the data dirs exist
    if((imageDirs.size() == 0 || editsDirs.size() == 0) 
                             && startOpt != StartupOption.IMPORT)  
      throw new IOException(
          "All specified directories are not accessible or do not exist.");
    
    // 1. For each data directory calculate its state and 
    // check whether all is consistent before transitioning.
    Map dataDirStates = 
             new HashMap();
    boolean isFormatted = recoverStorageDirs(startOpt, storage, dataDirStates);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Data dir states:\n  " +
        Joiner.on("\n  ").withKeyValueSeparator(": ")
        .join(dataDirStates));
    }
    
    if (!isFormatted && startOpt != StartupOption.ROLLBACK 
                     && startOpt != StartupOption.IMPORT) {
      throw new IOException("NameNode is not formatted.");      
    }


    int layoutVersion = storage.getLayoutVersion();
    if (startOpt == StartupOption.METADATAVERSION) {
      System.out.println("HDFS Image Version: " + layoutVersion);
      System.out.println("Software format version: " +
        HdfsConstants.NAMENODE_LAYOUT_VERSION);
      return false;
    }

    if (layoutVersion < Storage.LAST_PRE_UPGRADE_LAYOUT_VERSION) {
      NNStorage.checkVersionUpgradable(storage.getLayoutVersion());
    }
    if (startOpt != StartupOption.UPGRADE
        && startOpt != StartupOption.UPGRADEONLY
        && !RollingUpgradeStartupOption.STARTED.matches(startOpt)
        && layoutVersion < Storage.LAST_PRE_UPGRADE_LAYOUT_VERSION
        && layoutVersion != HdfsConstants.NAMENODE_LAYOUT_VERSION) {
      throw new IOException(
          "\nFile system image contains an old layout version " 
          + storage.getLayoutVersion() + ".\nAn upgrade to version "
          + HdfsConstants.NAMENODE_LAYOUT_VERSION + " is required.\n"
          + "Please restart NameNode with the \""
          + RollingUpgradeStartupOption.STARTED.getOptionString()
          + "\" option if a rolling upgrade is already started;"
          + " or restart NameNode with the \""
          + StartupOption.UPGRADE.getName() + "\" option to start"
          + " a new upgrade.");
    }
    
    storage.processStartupOptionsForUpgrade(startOpt, layoutVersion);

    // 2. Format unformatted dirs.
    for (Iterator it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      StorageState curState = dataDirStates.get(sd);
      switch(curState) {
      case NON_EXISTENT:
        throw new IOException(StorageState.NON_EXISTENT + 
                              " state cannot be here");
      case NOT_FORMATTED:
        // Create a dir structure, but not the VERSION file. The presence of
        // VERSION is checked in the inspector's needToSave() method and
        // saveNamespace is triggered if it is absent. This will bring
        // the storage state uptodate along with a new VERSION file.
        // If HA is enabled, NNs start up as standby so saveNamespace is not
        // triggered.
        LOG.info("Storage directory " + sd.getRoot() + " is not formatted.");
        LOG.info("Formatting ...");
        sd.clearDirectory(); // create empty currrent dir
        // For non-HA, no further action is needed here, as saveNamespace will
        // take care of the rest.
        if (!target.isHaEnabled()) {
          continue;
        }
        // If HA is enabled, save the dirs to create a version file later when
        // a checkpoint image is saved.
        if (newDirs == null) {
          newDirs = new HashSet();
        }
        newDirs.add(sd);
        break;
      default:
        break;
      }
    }

    // 3. Do transitions
    switch(startOpt) {
    case UPGRADE:
    case UPGRADEONLY:
      doUpgrade(target);
      return false; // upgrade saved image already
    case IMPORT:
      doImportCheckpoint(target);
      return false; // import checkpoint saved image already
    case ROLLBACK:
      throw new AssertionError("Rollback is now a standalone command, "
          + "NameNode should not be starting with this option.");
    case REGULAR:
    default:
      // just load the image
    }
    
    return loadFSImage(target, startOpt, recovery);
  }

  /**
   * Create a VERSION file in the newly added storage directories.
   */
  private void initNewDirs() {
    if (newDirs == null) {
      return;
    }
    for (StorageDirectory sd : newDirs) {
      try {
        storage.writeProperties(sd);
        LOG.info("Wrote VERSION in the new storage, " + sd.getCurrentDir());
      } catch (IOException e) {
        // Failed to create a VERSION file. Report the error.
        storage.reportErrorOnFile(sd.getVersionFile());
      }
    }
    newDirs.clear();
    newDirs = null;
  }

  /**
   * For each storage directory, performs recovery of incomplete transitions
   * (eg. upgrade, rollback, checkpoint) and inserts the directory's storage
   * state into the dataDirStates map.
   * @param dataDirStates output of storage directory states
   * @return true if there is at least one valid formatted storage directory
   */
  public static boolean recoverStorageDirs(StartupOption startOpt,
      NNStorage storage, Map dataDirStates)
      throws IOException {
    boolean isFormatted = false;
    // This loop needs to be over all storage dirs, even shared dirs, to make
    // sure that we properly examine their state, but we make sure we don't
    // mutate the shared dir below in the actual loop.
    for (Iterator it = 
                      storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      StorageState curState;
      if (startOpt == StartupOption.METADATAVERSION) {
        /* All we need is the layout version. */
        storage.readProperties(sd);
        return true;
      }

      try {
        curState = sd.analyzeStorage(startOpt, storage);
        // sd is locked but not opened
        switch(curState) {
        case NON_EXISTENT:
          // name-node fails if any of the configured storage dirs are missing
          throw new InconsistentFSStateException(sd.getRoot(),
                      "storage directory does not exist or is not accessible.");
        case NOT_FORMATTED:
          break;
        case NORMAL:
          break;
        default:  // recovery is possible
          sd.doRecover(curState);
        }
        if (curState != StorageState.NOT_FORMATTED 
            && startOpt != StartupOption.ROLLBACK) {
          // read and verify consistency with other directories
          storage.readProperties(sd, startOpt);
          isFormatted = true;
        }
        if (startOpt == StartupOption.IMPORT && isFormatted)
          // import of a checkpoint is allowed only into empty image directories
          throw new IOException("Cannot import image from a checkpoint. " 
              + " NameNode already contains an image in " + sd.getRoot());
      } catch (IOException ioe) {
        sd.unlock();
        throw ioe;
      }
      dataDirStates.put(sd,curState);
    }
    return isFormatted;
  }

  /** Check if upgrade is in progress. */
  public static void checkUpgrade(NNStorage storage) throws IOException {
    // Upgrade or rolling upgrade is allowed only if there are 
    // no previous fs states in any of the directories
    for (Iterator it = storage.dirIterator(false); it.hasNext();) {
      StorageDirectory sd = it.next();
      if (sd.getPreviousDir().exists())
        throw new InconsistentFSStateException(sd.getRoot(),
            "previous fs state should not exist during upgrade. "
            + "Finalize or rollback first.");
    }
  }

  void checkUpgrade() throws IOException {
    checkUpgrade(storage);
  }

  /**
   * @return true if there is rollback fsimage (for rolling upgrade) in NameNode
   * directory.
   */
  public boolean hasRollbackFSImage() throws IOException {
    final FSImageStorageInspector inspector = new FSImageTransactionalStorageInspector(
        EnumSet.of(NameNodeFile.IMAGE_ROLLBACK));
    storage.inspectStorageDirs(inspector);
    try {
      List images = inspector.getLatestImages();
      return images != null && !images.isEmpty();
    } catch (FileNotFoundException e) {
      return false;
    }
  }

  void doUpgrade(FSNamesystem target) throws IOException {
    checkUpgrade();

    // load the latest image
    this.loadFSImage(target, StartupOption.UPGRADE, null);

    // Do upgrade for each directory
    target.checkRollingUpgrade("upgrade namenode");
    
    long oldCTime = storage.getCTime();
    storage.cTime = now();  // generate new cTime for the state
    int oldLV = storage.getLayoutVersion();
    storage.layoutVersion = HdfsConstants.NAMENODE_LAYOUT_VERSION;
    
    List errorSDs =
      Collections.synchronizedList(new ArrayList());
    assert !editLog.isSegmentOpen() : "Edits log must not be open.";
    LOG.info("Starting upgrade of local storage directories."
        + "\n   old LV = " + oldLV
        + "; old CTime = " + oldCTime
        + ".\n   new LV = " + storage.getLayoutVersion()
        + "; new CTime = " + storage.getCTime());
    // Do upgrade for each directory
    for (Iterator it = storage.dirIterator(false); it.hasNext();) {
      StorageDirectory sd = it.next();
      try {
        NNUpgradeUtil.doPreUpgrade(conf, sd);
      } catch (Exception e) {
        LOG.error("Failed to move aside pre-upgrade storage " +
            "in image directory " + sd.getRoot(), e);
        errorSDs.add(sd);
        continue;
      }
    }
    if (target.isHaEnabled()) {
      editLog.doPreUpgradeOfSharedLog();
    }
    storage.reportErrorsOnDirectories(errorSDs);
    errorSDs.clear();

    saveFSImageInAllDirs(target, editLog.getLastWrittenTxId());

    // upgrade shared edit storage first
    if (target.isHaEnabled()) {
      editLog.doUpgradeOfSharedLog();
    }
    for (Iterator it = storage.dirIterator(false); it.hasNext();) {
      StorageDirectory sd = it.next();
      try {
        NNUpgradeUtil.doUpgrade(sd, storage);
      } catch (IOException ioe) {
        errorSDs.add(sd);
        continue;
      }
    }
    storage.reportErrorsOnDirectories(errorSDs);
    
    isUpgradeFinalized = false;
    if (!storage.getRemovedStorageDirs().isEmpty()) {
      // during upgrade, it's a fatal error to fail any storage directory
      throw new IOException("Upgrade failed in "
          + storage.getRemovedStorageDirs().size()
          + " storage directory(ies), previously logged.");
    }
  }

  void doRollback(FSNamesystem fsns) throws IOException {
    // Rollback is allowed only if there is 
    // a previous fs states in at least one of the storage directories.
    // Directories that don't have previous state do not rollback
    boolean canRollback = false;
    FSImage prevState = new FSImage(conf);
    try {
      prevState.getStorage().layoutVersion = HdfsConstants.NAMENODE_LAYOUT_VERSION;
      for (Iterator it = storage.dirIterator(false); it.hasNext();) {
        StorageDirectory sd = it.next();
        if (!NNUpgradeUtil.canRollBack(sd, storage, prevState.getStorage(),
            HdfsConstants.NAMENODE_LAYOUT_VERSION)) {
          continue;
        }
        LOG.info("Can perform rollback for " + sd);
        canRollback = true;
      }
      
      if (fsns.isHaEnabled()) {
        // If HA is enabled, check if the shared log can be rolled back as well.
        editLog.initJournalsForWrite();
        boolean canRollBackSharedEditLog = editLog.canRollBackSharedLog(
            prevState.getStorage(), HdfsConstants.NAMENODE_LAYOUT_VERSION);
        if (canRollBackSharedEditLog) {
          LOG.info("Can perform rollback for shared edit log.");
          canRollback = true;
        }
      }
      
      if (!canRollback)
        throw new IOException("Cannot rollback. None of the storage "
            + "directories contain previous fs state.");
  
      // Now that we know all directories are going to be consistent
      // Do rollback for each directory containing previous state
      for (Iterator it = storage.dirIterator(false); it.hasNext();) {
        StorageDirectory sd = it.next();
        LOG.info("Rolling back storage directory " + sd.getRoot()
                 + ".\n   new LV = " + prevState.getStorage().getLayoutVersion()
                 + "; new CTime = " + prevState.getStorage().getCTime());
        NNUpgradeUtil.doRollBack(sd);
      }
      if (fsns.isHaEnabled()) {
        // If HA is enabled, try to roll back the shared log as well.
        editLog.doRollback();
      }
      
      isUpgradeFinalized = true;
    } finally {
      prevState.close();
    }
  }

  /**
   * Load image from a checkpoint directory and save it into the current one.
   * @param target the NameSystem to import into
   * @throws IOException
   */
  void doImportCheckpoint(FSNamesystem target) throws IOException {
    Collection checkpointDirs =
      FSImage.getCheckpointDirs(conf, null);
    List checkpointEditsDirs =
      FSImage.getCheckpointEditsDirs(conf, null);

    if (checkpointDirs == null || checkpointDirs.isEmpty()) {
      throw new IOException("Cannot import image from a checkpoint. "
                            + "\"dfs.namenode.checkpoint.dir\" is not set." );
    }
    
    if (checkpointEditsDirs == null || checkpointEditsDirs.isEmpty()) {
      throw new IOException("Cannot import image from a checkpoint. "
                            + "\"dfs.namenode.checkpoint.dir\" is not set." );
    }

    FSImage realImage = target.getFSImage();
    FSImage ckptImage = new FSImage(conf, 
                                    checkpointDirs, checkpointEditsDirs);
    // load from the checkpoint dirs
    try {
      ckptImage.recoverTransitionRead(StartupOption.REGULAR, target, null);
    } finally {
      ckptImage.close();
    }
    // return back the real image
    realImage.getStorage().setStorageInfo(ckptImage.getStorage());
    realImage.getEditLog().setNextTxId(ckptImage.getEditLog().getLastWrittenTxId()+1);
    realImage.initEditLog(StartupOption.IMPORT);

    realImage.getStorage().setBlockPoolID(ckptImage.getBlockPoolID());

    // and save it but keep the same checkpointTime
    saveNamespace(target);
    getStorage().writeAll();
  }
  
  void finalizeUpgrade(boolean finalizeEditLog) throws IOException {
    LOG.info("Finalizing upgrade for local dirs. " +
        (storage.getLayoutVersion() == 0 ? "" : 
          "\n   cur LV = " + storage.getLayoutVersion()
          + "; cur CTime = " + storage.getCTime()));
    for (Iterator it = storage.dirIterator(false); it.hasNext();) {
      StorageDirectory sd = it.next();
      NNUpgradeUtil.doFinalize(sd);
    }
    if (finalizeEditLog) {
      // We only do this in the case that HA is enabled and we're active. In any
      // other case the NN will have done the upgrade of the edits directories
      // already by virtue of the fact that they're local.
      editLog.doFinalizeOfSharedLog();
    }
    isUpgradeFinalized = true;
  }

  boolean isUpgradeFinalized() {
    return isUpgradeFinalized;
  }

  public FSEditLog getEditLog() {
    return editLog;
  }

  void openEditLogForWrite() throws IOException {
    assert editLog != null : "editLog must be initialized";
    editLog.openForWrite();
    storage.writeTransactionIdFileToStorage(editLog.getCurSegmentTxId());
  }
  
  /**
   * Toss the current image and namesystem, reloading from the specified
   * file.
   */
  void reloadFromImageFile(File file, FSNamesystem target) throws IOException {
    target.clear();
    LOG.debug("Reloading namespace from " + file);
    loadFSImage(file, target, null, false);
  }

  /**
   * Choose latest image from one of the directories,
   * load it and merge with the edits.
   * 
   * Saving and loading fsimage should never trigger symlink resolution. 
   * The paths that are persisted do not have *intermediate* symlinks 
   * because intermediate symlinks are resolved at the time files, 
   * directories, and symlinks are created. All paths accessed while 
   * loading or saving fsimage should therefore only see symlinks as 
   * the final path component, and the functions called below do not
   * resolve symlinks that are the final path component.
   *
   * @return whether the image should be saved
   * @throws IOException
   */
  private boolean loadFSImage(FSNamesystem target, StartupOption startOpt,
      MetaRecoveryContext recovery)
      throws IOException {
    final boolean rollingRollback
        = RollingUpgradeStartupOption.ROLLBACK.matches(startOpt);
    final EnumSet nnfs;
    if (rollingRollback) {
      // if it is rollback of rolling upgrade, only load from the rollback image
      nnfs = EnumSet.of(NameNodeFile.IMAGE_ROLLBACK);
    } else {
      // otherwise we can load from both IMAGE and IMAGE_ROLLBACK
      nnfs = EnumSet.of(NameNodeFile.IMAGE, NameNodeFile.IMAGE_ROLLBACK);
    }
    final FSImageStorageInspector inspector = storage
        .readAndInspectDirs(nnfs, startOpt);

    isUpgradeFinalized = inspector.isUpgradeFinalized();
    List imageFiles = inspector.getLatestImages();

    StartupProgress prog = NameNode.getStartupProgress();
    prog.beginPhase(Phase.LOADING_FSIMAGE);
    File phaseFile = imageFiles.get(0).getFile();
    prog.setFile(Phase.LOADING_FSIMAGE, phaseFile.getAbsolutePath());
    prog.setSize(Phase.LOADING_FSIMAGE, phaseFile.length());
    boolean needToSave = inspector.needToSave();

    Iterable editStreams = null;

    initEditLog(startOpt);

    if (NameNodeLayoutVersion.supports(
        LayoutVersion.Feature.TXID_BASED_LAYOUT, getLayoutVersion())) {
      // If we're open for write, we're either non-HA or we're the active NN, so
      // we better be able to load all the edits. If we're the standby NN, it's
      // OK to not be able to read all of edits right now.
      // In the meanwhile, for HA upgrade, we will still write editlog thus need
      // this toAtLeastTxId to be set to the max-seen txid
      // For rollback in rolling upgrade, we need to set the toAtLeastTxId to
      // the txid right before the upgrade marker.  
      long toAtLeastTxId = editLog.isOpenForWrite() ? inspector
          .getMaxSeenTxId() : 0;
      if (rollingRollback) {
        // note that the first image in imageFiles is the special checkpoint
        // for the rolling upgrade
        toAtLeastTxId = imageFiles.get(0).getCheckpointTxId() + 2;
      }
      editStreams = editLog.selectInputStreams(
          imageFiles.get(0).getCheckpointTxId() + 1,
          toAtLeastTxId, recovery, false);
    } else {
      editStreams = FSImagePreTransactionalStorageInspector
        .getEditLogStreams(storage);
    }
    int maxOpSize = conf.getInt(DFSConfigKeys.DFS_NAMENODE_MAX_OP_SIZE_KEY,
        DFSConfigKeys.DFS_NAMENODE_MAX_OP_SIZE_DEFAULT);
    for (EditLogInputStream elis : editStreams) {
      elis.setMaxOpSize(maxOpSize);
    }
 
    for (EditLogInputStream l : editStreams) {
      LOG.debug("Planning to load edit log stream: " + l);
    }
    if (!editStreams.iterator().hasNext()) {
      LOG.info("No edit log streams selected.");
    }
    
    Exception le = null;
    FSImageFile imageFile = null;
    for (int i = 0; i < imageFiles.size(); i++) {
      try {
        imageFile = imageFiles.get(i);
        loadFSImageFile(target, recovery, imageFile, startOpt);
        break;
      } catch (IllegalReservedPathException ie) {
        throw new IOException("Failed to load image from " + imageFile,
            ie);
      } catch (Exception e) {
        le = e;
        LOG.error("Failed to load image from " + imageFile, e);
        target.clear();
        imageFile = null;
      }
    }
    // Failed to load any images, error out
    if (imageFile == null) {
      FSEditLog.closeAllStreams(editStreams);
      throw new IOException("Failed to load FSImage file, see error(s) " +
          "above for more info.");
    }
    prog.endPhase(Phase.LOADING_FSIMAGE);
    
    if (!rollingRollback) {
      long txnsAdvanced = loadEdits(editStreams, target, startOpt, recovery);
      needToSave |= needsResaveBasedOnStaleCheckpoint(imageFile.getFile(),
          txnsAdvanced);
      if (RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        // rename rollback image if it is downgrade
        renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK, NameNodeFile.IMAGE);
      }
    } else {
      // Trigger the rollback for rolling upgrade. Here lastAppliedTxId equals
      // to the last txid in rollback fsimage.
      rollingRollback(lastAppliedTxId + 1, imageFiles.get(0).getCheckpointTxId());
      needToSave = false;
    }
    editLog.setNextTxId(lastAppliedTxId + 1);
    return needToSave;
  }

  /** rollback for rolling upgrade. */
  private void rollingRollback(long discardSegmentTxId, long ckptId)
      throws IOException {
    // discard discard unnecessary editlog segments starting from the given id
    this.editLog.discardSegments(discardSegmentTxId);
    // rename the special checkpoint
    renameCheckpoint(ckptId, NameNodeFile.IMAGE_ROLLBACK, NameNodeFile.IMAGE,
        true);
    // purge all the checkpoints after the marker
    archivalManager.purgeCheckpoinsAfter(NameNodeFile.IMAGE, ckptId);
    String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
    if (HAUtil.isHAEnabled(conf, nameserviceId)) {
      // close the editlog since it is currently open for write
      this.editLog.close();
      // reopen the editlog for read
      this.editLog.initSharedJournalsForRead();
    }
  }

  void loadFSImageFile(FSNamesystem target, MetaRecoveryContext recovery,
      FSImageFile imageFile, StartupOption startupOption) throws IOException {
    LOG.info("Planning to load image: " + imageFile);
    StorageDirectory sdForProperties = imageFile.sd;
    storage.readProperties(sdForProperties, startupOption);

    if (NameNodeLayoutVersion.supports(
        LayoutVersion.Feature.TXID_BASED_LAYOUT, getLayoutVersion())) {
      // For txid-based layout, we should have a .md5 file
      // next to the image file
      boolean isRollingRollback = RollingUpgradeStartupOption.ROLLBACK
          .matches(startupOption);
      loadFSImage(imageFile.getFile(), target, recovery, isRollingRollback);
    } else if (NameNodeLayoutVersion.supports(
        LayoutVersion.Feature.FSIMAGE_CHECKSUM, getLayoutVersion())) {
      // In 0.22, we have the checksum stored in the VERSION file.
      String md5 = storage.getDeprecatedProperty(
          NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY);
      if (md5 == null) {
        throw new InconsistentFSStateException(sdForProperties.getRoot(),
            "Message digest property " +
            NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY +
            " not set for storage directory " + sdForProperties.getRoot());
      }
      loadFSImage(imageFile.getFile(), new MD5Hash(md5), target, recovery,
          false);
    } else {
      // We don't have any record of the md5sum
      loadFSImage(imageFile.getFile(), null, target, recovery, false);
    }
  }

  public void initEditLog(StartupOption startOpt) throws IOException {
    Preconditions.checkState(getNamespaceID() != 0,
        "Must know namespace ID before initting edit log");
    String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
    if (!HAUtil.isHAEnabled(conf, nameserviceId)) {
      // If this NN is not HA
      editLog.initJournalsForWrite();
      editLog.recoverUnclosedStreams();
    } else if (HAUtil.isHAEnabled(conf, nameserviceId)
        && (startOpt == StartupOption.UPGRADE
            || startOpt == StartupOption.UPGRADEONLY
            || RollingUpgradeStartupOption.ROLLBACK.matches(startOpt))) {
      // This NN is HA, but we're doing an upgrade or a rollback of rolling
      // upgrade so init the edit log for write.
      editLog.initJournalsForWrite();
      if (startOpt == StartupOption.UPGRADE
          || startOpt == StartupOption.UPGRADEONLY) {
        long sharedLogCTime = editLog.getSharedLogCTime();
        if (this.storage.getCTime() < sharedLogCTime) {
          throw new IOException("It looks like the shared log is already " +
              "being upgraded but this NN has not been upgraded yet. You " +
              "should restart this NameNode with the '" +
              StartupOption.BOOTSTRAPSTANDBY.getName() + "' option to bring " +
              "this NN in sync with the other.");
        }
      }
      editLog.recoverUnclosedStreams();
    } else {
      // This NN is HA and we're not doing an upgrade.
      editLog.initSharedJournalsForRead();
    }
  }

  /**
   * @param imageFile the image file that was loaded
   * @param numEditsLoaded the number of edits loaded from edits logs
   * @return true if the NameNode should automatically save the namespace
   * when it is started, due to the latest checkpoint being too old.
   */
  private boolean needsResaveBasedOnStaleCheckpoint(
      File imageFile, long numEditsLoaded) {
    final long checkpointPeriod = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
    final long checkpointTxnCount = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
    long checkpointAge = Time.now() - imageFile.lastModified();

    return (checkpointAge > checkpointPeriod * 1000) ||
           (numEditsLoaded > checkpointTxnCount);
  }
  
  /**
   * Load the specified list of edit files into the image.
   */
  public long loadEdits(Iterable editStreams,
      FSNamesystem target) throws IOException {
    return loadEdits(editStreams, target, null, null);
  }

  private long loadEdits(Iterable editStreams,
      FSNamesystem target, StartupOption startOpt, MetaRecoveryContext recovery)
      throws IOException {
    LOG.debug("About to load edits:\n  " + Joiner.on("\n  ").join(editStreams));
    StartupProgress prog = NameNode.getStartupProgress();
    prog.beginPhase(Phase.LOADING_EDITS);
    
    long prevLastAppliedTxId = lastAppliedTxId;  
    try {    
      FSEditLogLoader loader = new FSEditLogLoader(target, lastAppliedTxId);
      
      // Load latest edits
      for (EditLogInputStream editIn : editStreams) {
        LOG.info("Reading " + editIn + " expecting start txid #" +
              (lastAppliedTxId + 1));
        try {
          loader.loadFSEdits(editIn, lastAppliedTxId + 1, startOpt, recovery);
        } finally {
          // Update lastAppliedTxId even in case of error, since some ops may
          // have been successfully applied before the error.
          lastAppliedTxId = loader.getLastAppliedTxId();
        }
        // If we are in recovery mode, we may have skipped over some txids.
        if (editIn.getLastTxId() != HdfsConstants.INVALID_TXID) {
          lastAppliedTxId = editIn.getLastTxId();
        }
      }
    } finally {
      FSEditLog.closeAllStreams(editStreams);
      // update the counts
      updateCountForQuota(target.getBlockManager().getStoragePolicySuite(),
          target.dir.rootDir);
    }
    prog.endPhase(Phase.LOADING_EDITS);
    return lastAppliedTxId - prevLastAppliedTxId;
  }

  /**
   * Update the count of each directory with quota in the namespace.
   * A directory's count is defined as the total number inodes in the tree
   * rooted at the directory.
   * 
   * This is an update of existing state of the filesystem and does not
   * throw QuotaExceededException.
   */
  static void updateCountForQuota(BlockStoragePolicySuite bsps,
                                  INodeDirectory root) {
    updateCountForQuotaRecursively(bsps, root.getStoragePolicyID(), root,
        new QuotaCounts.Builder().build());
 }

  private static void updateCountForQuotaRecursively(BlockStoragePolicySuite bsps,
      byte blockStoragePolicyId, INodeDirectory dir, QuotaCounts counts) {
    final long parentNamespace = counts.getNameSpace();
    final long parentStoragespace = counts.getStorageSpace();
    final EnumCounters parentTypeSpaces = counts.getTypeSpaces();

    dir.computeQuotaUsage4CurrentDirectory(bsps, blockStoragePolicyId, counts);
    
    for (INode child : dir.getChildrenList(Snapshot.CURRENT_STATE_ID)) {
      final byte childPolicyId = child.getStoragePolicyIDForQuota(blockStoragePolicyId);
      if (child.isDirectory()) {
        updateCountForQuotaRecursively(bsps, childPolicyId,
            child.asDirectory(), counts);
      } else {
        // file or symlink: count here to reduce recursive calls.
        child.computeQuotaUsage(bsps, childPolicyId, counts, false,
            Snapshot.CURRENT_STATE_ID);
      }
    }
      
    if (dir.isQuotaSet()) {
      // check if quota is violated. It indicates a software bug.
      final QuotaCounts q = dir.getQuotaCounts();

      final long namespace = counts.getNameSpace() - parentNamespace;
      final long nsQuota = q.getNameSpace();
      if (Quota.isViolated(nsQuota, namespace)) {
        LOG.warn("Namespace quota violation in image for "
            + dir.getFullPathName()
            + " quota = " + nsQuota + " < consumed = " + namespace);
      }

      final long ssConsumed = counts.getStorageSpace() - parentStoragespace;
      final long ssQuota = q.getStorageSpace();
      if (Quota.isViolated(ssQuota, ssConsumed)) {
        LOG.warn("Storagespace quota violation in image for "
            + dir.getFullPathName()
            + " quota = " + ssQuota + " < consumed = " + ssConsumed);
      }

      final EnumCounters typeSpaces = counts.getTypeSpaces();
      for (StorageType t : StorageType.getTypesSupportingQuota()) {
        final long typeSpace = typeSpaces.get(t) - parentTypeSpaces.get(t);
        final long typeQuota = q.getTypeSpaces().get(t);
        if (Quota.isViolated(typeQuota, typeSpace)) {
          LOG.warn("Storage type quota violation in image for "
              + dir.getFullPathName()
              + " type = " + t.toString() + " quota = "
              + typeQuota + " < consumed " + typeSpace);
        }
      }

      dir.getDirectoryWithQuotaFeature().setSpaceConsumed(namespace, ssConsumed,
          typeSpaces);
    }
  }

  /**
   * Load the image namespace from the given image file, verifying
   * it against the MD5 sum stored in its associated .md5 file.
   */
  private void loadFSImage(File imageFile, FSNamesystem target,
      MetaRecoveryContext recovery, boolean requireSameLayoutVersion)
      throws IOException {
    MD5Hash expectedMD5 = MD5FileUtils.readStoredMd5ForFile(imageFile);
    if (expectedMD5 == null) {
      throw new IOException("No MD5 file found corresponding to image file "
          + imageFile);
    }
    loadFSImage(imageFile, expectedMD5, target, recovery,
        requireSameLayoutVersion);
  }
  
  /**
   * Load in the filesystem image from file. It's a big list of
   * filenames and blocks.
   */
  private void loadFSImage(File curFile, MD5Hash expectedMd5,
      FSNamesystem target, MetaRecoveryContext recovery,
      boolean requireSameLayoutVersion) throws IOException {
    // BlockPoolId is required when the FsImageLoader loads the rolling upgrade
    // information. Make sure the ID is properly set.
    target.setBlockPoolId(this.getBlockPoolID());

    FSImageFormat.LoaderDelegator loader = FSImageFormat.newLoader(conf, target);
    loader.load(curFile, requireSameLayoutVersion);

    // Check that the image digest we loaded matches up with what
    // we expected
    MD5Hash readImageMd5 = loader.getLoadedImageMd5();
    if (expectedMd5 != null &&
        !expectedMd5.equals(readImageMd5)) {
      throw new IOException("Image file " + curFile +
          " is corrupt with MD5 checksum of " + readImageMd5 +
          " but expecting " + expectedMd5);
    }

    long txId = loader.getLoadedImageTxId();
    LOG.info("Loaded image for txid " + txId + " from " + curFile);
    lastAppliedTxId = txId;
    storage.setMostRecentCheckpointInfo(txId, curFile.lastModified());
  }

  /**
   * Save the contents of the FS image to the file.
   */
  void saveFSImage(SaveNamespaceContext context, StorageDirectory sd,
      NameNodeFile dstType) throws IOException {
    long txid = context.getTxId();
    File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
    File dstFile = NNStorage.getStorageFile(sd, dstType, txid);
    
    FSImageFormatProtobuf.Saver saver = new FSImageFormatProtobuf.Saver(context);
    FSImageCompression compression = FSImageCompression.createCompression(conf);
    saver.save(newFile, compression);
    
    MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
    storage.setMostRecentCheckpointInfo(txid, Time.now());
  }

  /**
   * Save FSimage in the legacy format. This is not for NN consumption,
   * but for tools like OIV.
   */
  public void saveLegacyOIVImage(FSNamesystem source, String targetDir,
      Canceler canceler) throws IOException {
    FSImageCompression compression =
        FSImageCompression.createCompression(conf);
    long txid = getCorrectLastAppliedOrWrittenTxId();
    SaveNamespaceContext ctx = new SaveNamespaceContext(source, txid,
        canceler);
    FSImageFormat.Saver saver = new FSImageFormat.Saver(ctx);
    String imageFileName = NNStorage.getLegacyOIVImageFileName(txid);
    File imageFile = new File(targetDir, imageFileName);
    saver.save(imageFile, compression);
    archivalManager.purgeOldLegacyOIVImages(targetDir, txid);
  }


  /**
   * FSImageSaver is being run in a separate thread when saving
   * FSImage. There is one thread per each copy of the image.
   *
   * FSImageSaver assumes that it was launched from a thread that holds
   * FSNamesystem lock and waits for the execution of FSImageSaver thread
   * to finish.
   * This way we are guaranteed that the namespace is not being updated
   * while multiple instances of FSImageSaver are traversing it
   * and writing it out.
   */
  private class FSImageSaver implements Runnable {
    private final SaveNamespaceContext context;
    private final StorageDirectory sd;
    private final NameNodeFile nnf;

    public FSImageSaver(SaveNamespaceContext context, StorageDirectory sd,
        NameNodeFile nnf) {
      this.context = context;
      this.sd = sd;
      this.nnf = nnf;
    }

    @Override
    public void run() {
      try {
        saveFSImage(context, sd, nnf);
      } catch (SaveNamespaceCancelledException snce) {
        LOG.info("Cancelled image saving for " + sd.getRoot() +
            ": " + snce.getMessage());
        // don't report an error on the storage dir!
      } catch (Throwable t) {
        LOG.error("Unable to save image for " + sd.getRoot(), t);
        context.reportErrorOnStorageDirectory(sd);
      }
    }
    
    @Override
    public String toString() {
      return "FSImageSaver for " + sd.getRoot() +
             " of type " + sd.getStorageDirType();
    }
  }
  
  private void waitForThreads(List threads) {
    for (Thread thread : threads) {
      while (thread.isAlive()) {
        try {
          thread.join();
        } catch (InterruptedException iex) {
          LOG.error("Caught interrupted exception while waiting for thread " +
                    thread.getName() + " to finish. Retrying join");
        }        
      }
    }
  }
  
  /**
   * Update version of all storage directories.
   */
  public synchronized void updateStorageVersion() throws IOException {
    storage.writeAll();
  }

  /**
   * @see #saveNamespace(FSNamesystem, NameNodeFile, Canceler)
   */
  public synchronized void saveNamespace(FSNamesystem source)
      throws IOException {
    saveNamespace(source, NameNodeFile.IMAGE, null);
  }
  
  /**
   * Save the contents of the FS image to a new image file in each of the
   * current storage directories.
   */
  public synchronized void saveNamespace(FSNamesystem source, NameNodeFile nnf,
      Canceler canceler) throws IOException {
    assert editLog != null : "editLog must be initialized";
    LOG.info("Save namespace ...");
    storage.attemptRestoreRemovedStorage();

    boolean editLogWasOpen = editLog.isSegmentOpen();
    
    if (editLogWasOpen) {
      editLog.endCurrentLogSegment(true);
    }
    long imageTxId = getCorrectLastAppliedOrWrittenTxId();
    if (!addToCheckpointing(imageTxId)) {
      throw new IOException(
          "FS image is being downloaded from another NN at txid " + imageTxId);
    }
    try {
      try {
        saveFSImageInAllDirs(source, nnf, imageTxId, canceler);
        storage.writeAll();
      } finally {
        if (editLogWasOpen) {
          editLog.startLogSegment(imageTxId + 1, true);
          // Take this opportunity to note the current transaction.
          // Even if the namespace save was cancelled, this marker
          // is only used to determine what transaction ID is required
          // for startup. So, it doesn't hurt to update it unnecessarily.
          storage.writeTransactionIdFileToStorage(imageTxId + 1);
        }
      }
    } finally {
      removeFromCheckpointing(imageTxId);
    }
  }

  /**
   * @see #saveFSImageInAllDirs(FSNamesystem, NameNodeFile, long, Canceler)
   */
  protected synchronized void saveFSImageInAllDirs(FSNamesystem source, long txid)
      throws IOException {
    if (!addToCheckpointing(txid)) {
      throw new IOException(("FS image is being downloaded from another NN"));
    }
    try {
      saveFSImageInAllDirs(source, NameNodeFile.IMAGE, txid, null);
    } finally {
      removeFromCheckpointing(txid);
    }
  }

  public boolean addToCheckpointing(long txid) {
    return currentlyCheckpointing.add(txid);
  }

  public void removeFromCheckpointing(long txid) {
    currentlyCheckpointing.remove(txid);
  }

  private synchronized void saveFSImageInAllDirs(FSNamesystem source,
      NameNodeFile nnf, long txid, Canceler canceler) throws IOException {
    StartupProgress prog = NameNode.getStartupProgress();
    prog.beginPhase(Phase.SAVING_CHECKPOINT);
    if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
      throw new IOException("No image directories available!");
    }
    if (canceler == null) {
      canceler = new Canceler();
    }
    SaveNamespaceContext ctx = new SaveNamespaceContext(
        source, txid, canceler);
    
    try {
      List saveThreads = new ArrayList();
      // save images into current
      for (Iterator it
             = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
        StorageDirectory sd = it.next();
        FSImageSaver saver = new FSImageSaver(ctx, sd, nnf);
        Thread saveThread = new Thread(saver, saver.toString());
        saveThreads.add(saveThread);
        saveThread.start();
      }
      waitForThreads(saveThreads);
      saveThreads.clear();
      storage.reportErrorsOnDirectories(ctx.getErrorSDs());
  
      if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
        throw new IOException(
          "Failed to save in any storage directories while saving namespace.");
      }
      if (canceler.isCancelled()) {
        deleteCancelledCheckpoint(txid);
        ctx.checkCancelled(); // throws
        assert false : "should have thrown above!";
      }
  
      renameCheckpoint(txid, NameNodeFile.IMAGE_NEW, nnf, false);
  
      // Since we now have a new checkpoint, we can clean up some
      // old edit logs and checkpoints.
      purgeOldStorage(nnf);
    } finally {
      // Notify any threads waiting on the checkpoint to be canceled
      // that it is complete.
      ctx.markComplete();
      ctx = null;
    }
    prog.endPhase(Phase.SAVING_CHECKPOINT);
  }

  /**
   * Purge any files in the storage directories that are no longer
   * necessary.
   */
  void purgeOldStorage(NameNodeFile nnf) {
    try {
      archivalManager.purgeOldStorage(nnf);
    } catch (Exception e) {
      LOG.warn("Unable to purge old storage " + nnf.getName(), e);
    }
  }

  /**
   * Rename FSImage with the specific txid
   */
  private void renameCheckpoint(long txid, NameNodeFile fromNnf,
      NameNodeFile toNnf, boolean renameMD5) throws IOException {
    ArrayList al = null;

    for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
      try {
        renameImageFileInDir(sd, fromNnf, toNnf, txid, renameMD5);
      } catch (IOException ioe) {
        LOG.warn("Unable to rename checkpoint in " + sd, ioe);
        if (al == null) {
          al = Lists.newArrayList();
        }
        al.add(sd);
      }
    }
    if(al != null) storage.reportErrorsOnDirectories(al);
  }

  /**
   * Rename all the fsimage files with the specific NameNodeFile type. The
   * associated checksum files will also be renamed.
   */
  void renameCheckpoint(NameNodeFile fromNnf, NameNodeFile toNnf)
      throws IOException {
    ArrayList al = null;
    FSImageTransactionalStorageInspector inspector =
        new FSImageTransactionalStorageInspector(EnumSet.of(fromNnf));
    storage.inspectStorageDirs(inspector);
    for (FSImageFile image : inspector.getFoundImages()) {
      try {
        renameImageFileInDir(image.sd, fromNnf, toNnf, image.txId, true);
      } catch (IOException ioe) {
        LOG.warn("Unable to rename checkpoint in " + image.sd, ioe);
        if (al == null) {
          al = Lists.newArrayList();
        }
        al.add(image.sd);
      }
    }
    if(al != null) {
      storage.reportErrorsOnDirectories(al);
    }
  }

  /**
   * Deletes the checkpoint file in every storage directory,
   * since the checkpoint was cancelled.
   */
  private void deleteCancelledCheckpoint(long txid) throws IOException {
    ArrayList al = Lists.newArrayList();

    for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
      File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
      if (ckpt.exists() && !ckpt.delete()) {
        LOG.warn("Unable to delete cancelled checkpoint in " + sd);
        al.add(sd);            
      }
    }
    storage.reportErrorsOnDirectories(al);
  }

  private void renameImageFileInDir(StorageDirectory sd, NameNodeFile fromNnf,
      NameNodeFile toNnf, long txid, boolean renameMD5) throws IOException {
    final File fromFile = NNStorage.getStorageFile(sd, fromNnf, txid);
    final File toFile = NNStorage.getStorageFile(sd, toNnf, txid);
    // renameTo fails on Windows if the destination file already exists.
    if(LOG.isDebugEnabled()) {
      LOG.debug("renaming  " + fromFile.getAbsolutePath() 
                + " to " + toFile.getAbsolutePath());
    }
    if (!fromFile.renameTo(toFile)) {
      if (!toFile.delete() || !fromFile.renameTo(toFile)) {
        throw new IOException("renaming  " + fromFile.getAbsolutePath() + " to "  + 
            toFile.getAbsolutePath() + " FAILED");
      }
    }
    if (renameMD5) {
      MD5FileUtils.renameMD5File(fromFile, toFile);
    }
  }

  CheckpointSignature rollEditLog() throws IOException {
    getEditLog().rollEditLog();
    // Record this log segment ID in all of the storage directories, so
    // we won't miss this log segment on a restart if the edits directories
    // go missing.
    storage.writeTransactionIdFileToStorage(getEditLog().getCurSegmentTxId());
    return new CheckpointSignature(this);
  }

  /**
   * Start checkpoint.
   * 

* If backup storage contains image that is newer than or incompatible with * what the active name-node has, then the backup node should shutdown.
* If the backup image is older than the active one then it should * be discarded and downloaded from the active node.
* If the images are the same then the backup image will be used as current. * * @param bnReg the backup node registration. * @param nnReg this (active) name-node registration. * @return {@link NamenodeCommand} if backup node should shutdown or * {@link CheckpointCommand} prescribing what backup node should * do with its image. * @throws IOException */ NamenodeCommand startCheckpoint(NamenodeRegistration bnReg, // backup node NamenodeRegistration nnReg) // active name-node throws IOException { LOG.info("Start checkpoint at txid " + getEditLog().getLastWrittenTxId()); String msg = null; // Verify that checkpoint is allowed if(bnReg.getNamespaceID() != storage.getNamespaceID()) msg = "Name node " + bnReg.getAddress() + " has incompatible namespace id: " + bnReg.getNamespaceID() + " expected: " + storage.getNamespaceID(); else if(bnReg.isRole(NamenodeRole.NAMENODE)) msg = "Name node " + bnReg.getAddress() + " role " + bnReg.getRole() + ": checkpoint is not allowed."; else if(bnReg.getLayoutVersion() < storage.getLayoutVersion() || (bnReg.getLayoutVersion() == storage.getLayoutVersion() && bnReg.getCTime() > storage.getCTime())) // remote node has newer image age msg = "Name node " + bnReg.getAddress() + " has newer image layout version: LV = " +bnReg.getLayoutVersion() + " cTime = " + bnReg.getCTime() + ". Current version: LV = " + storage.getLayoutVersion() + " cTime = " + storage.getCTime(); if(msg != null) { LOG.error(msg); return new NamenodeCommand(NamenodeProtocol.ACT_SHUTDOWN); } boolean needToReturnImg = true; if(storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) // do not return image if there are no image directories needToReturnImg = false; CheckpointSignature sig = rollEditLog(); return new CheckpointCommand(sig, needToReturnImg); } /** * End checkpoint. *

* Validate the current storage info with the given signature. * * @param sig to validate the current storage info against * @throws IOException if the checkpoint fields are inconsistent */ void endCheckpoint(CheckpointSignature sig) throws IOException { LOG.info("End checkpoint at txid " + getEditLog().getLastWrittenTxId()); sig.validateStorageInfo(this); } /** * This is called by the 2NN after having downloaded an image, and by * the NN after having received a new image from the 2NN. It * renames the image from fsimage_N.ckpt to fsimage_N and also * saves the related .md5 file into place. */ public synchronized void saveDigestAndRenameCheckpointImage(NameNodeFile nnf, long txid, MD5Hash digest) throws IOException { // Write and rename MD5 file List badSds = Lists.newArrayList(); for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) { File imageFile = NNStorage.getImageFile(sd, nnf, txid); try { MD5FileUtils.saveMD5File(imageFile, digest); } catch (IOException ioe) { badSds.add(sd); } } storage.reportErrorsOnDirectories(badSds); CheckpointFaultInjector.getInstance().afterMD5Rename(); // Rename image from tmp file renameCheckpoint(txid, NameNodeFile.IMAGE_NEW, nnf, false); // So long as this is the newest image available, // advertise it as such to other checkpointers // from now on if (txid > storage.getMostRecentCheckpointTxId()) { storage.setMostRecentCheckpointInfo(txid, Time.now()); } // Create a version file in any new storage directory. initNewDirs(); } @Override synchronized public void close() throws IOException { if (editLog != null) { // 2NN doesn't have any edit log getEditLog().close(); } storage.close(); } /** * Retrieve checkpoint dirs from configuration. * * @param conf the Configuration * @param defaultValue a default value for the attribute, if null * @return a Collection of URIs representing the values in * dfs.namenode.checkpoint.dir configuration property */ static Collection getCheckpointDirs(Configuration conf, String defaultValue) { Collection dirNames = conf.getTrimmedStringCollection( DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY); if (dirNames.size() == 0 && defaultValue != null) { dirNames.add(defaultValue); } return Util.stringCollectionAsURIs(dirNames); } static List getCheckpointEditsDirs(Configuration conf, String defaultName) { Collection dirNames = conf.getTrimmedStringCollection( DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY); if (dirNames.size() == 0 && defaultName != null) { dirNames.add(defaultName); } return Util.stringCollectionAsURIs(dirNames); } public NNStorage getStorage() { return storage; } public int getLayoutVersion() { return storage.getLayoutVersion(); } public int getNamespaceID() { return storage.getNamespaceID(); } public String getClusterID() { return storage.getClusterID(); } public String getBlockPoolID() { return storage.getBlockPoolID(); } public synchronized long getLastAppliedTxId() { return lastAppliedTxId; } public long getLastAppliedOrWrittenTxId() { return Math.max(lastAppliedTxId, editLog != null ? editLog.getLastWrittenTxIdWithoutLock() : 0); } /** * This method holds a lock of FSEditLog to get the correct value. * This method must not be used for metrics. */ public long getCorrectLastAppliedOrWrittenTxId() { return Math.max(lastAppliedTxId, editLog != null ? editLog.getLastWrittenTxId() : 0); } public void updateLastAppliedTxIdFromWritten() { this.lastAppliedTxId = editLog.getLastWrittenTxId(); } // Should be OK for this to not be synchronized since all of the places which // mutate this value are themselves synchronized so it shouldn't be possible // to see this flop back and forth. In the worst case this will just return an // old value. public long getMostRecentCheckpointTxId() { return storage.getMostRecentCheckpointTxId(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy