alluxio.master.file.InodeSyncStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of alluxio-core-server-master Show documentation
Alluxio master service
The newest version!
/*
 * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
 * (the "License"). You may not use this work except in compliance with the License, which is
 * available at www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied, as more fully set forth in the License.
 *
 * See the NOTICE file distributed with this work for information regarding copyright ownership.
 */

package alluxio.master.file;

import alluxio.AlluxioURI;
import alluxio.client.WriteType;
import alluxio.collections.Pair;
import alluxio.conf.Configuration;
import alluxio.conf.PropertyKey;
import alluxio.exception.AccessControlException;
import alluxio.exception.BlockInfoException;
import alluxio.exception.DirectoryNotEmptyException;
import alluxio.exception.FileAlreadyCompletedException;
import alluxio.exception.FileAlreadyExistsException;
import alluxio.exception.FileDoesNotExistException;
import alluxio.exception.InvalidFileSizeException;
import alluxio.exception.InvalidPathException;
import alluxio.exception.status.UnavailableException;
import alluxio.file.options.DescendantType;
import alluxio.grpc.CompleteFilePOptions;
import alluxio.grpc.DeletePOptions;
import alluxio.grpc.FileSystemMasterCommonPOptions;
import alluxio.grpc.GrpcUtils;
import alluxio.grpc.LoadDescendantPType;
import alluxio.grpc.LoadMetadataPOptions;
import alluxio.grpc.SetAttributePOptions;
import alluxio.master.file.contexts.CompleteFileContext;
import alluxio.master.file.contexts.CreateDirectoryContext;
import alluxio.master.file.contexts.CreateFileContext;
import alluxio.master.file.contexts.DeleteContext;
import alluxio.master.file.contexts.LoadMetadataContext;
import alluxio.master.file.contexts.SetAttributeContext;
import alluxio.master.file.meta.Inode;
import alluxio.master.file.meta.InodeFile;
import alluxio.master.file.meta.InodeLockManager;
import alluxio.master.file.meta.InodeTree;
import alluxio.master.file.meta.InodeTree.LockPattern;
import alluxio.master.file.meta.LockedInodePath;
import alluxio.master.file.meta.LockingScheme;
import alluxio.master.file.meta.MountTable;
import alluxio.master.file.meta.MutableInodeFile;
import alluxio.master.file.meta.SyncCheck;
import alluxio.master.file.meta.SyncCheck.SyncResult;
import alluxio.master.file.meta.UfsAbsentPathCache;
import alluxio.master.file.meta.UfsSyncPathCache;
import alluxio.master.file.meta.UfsSyncUtils;
import alluxio.master.journal.FileSystemMergeJournalContext;
import alluxio.master.journal.JournalContext;
import alluxio.master.journal.MergeJournalContext;
import alluxio.master.journal.MetadataSyncMergeJournalContext;
import alluxio.master.journal.NoopJournalContext;
import alluxio.master.metastore.ReadOnlyInodeStore;
import alluxio.proto.journal.File;
import alluxio.proto.journal.Journal;
import alluxio.resource.CloseableIterator;
import alluxio.resource.CloseableResource;
import alluxio.security.authorization.AccessControlList;
import alluxio.security.authorization.DefaultAccessControlList;
import alluxio.security.authorization.Mode;
import alluxio.underfs.Fingerprint;
import alluxio.underfs.UfsFileStatus;
import alluxio.underfs.UfsManager;
import alluxio.underfs.UfsStatus;
import alluxio.underfs.UfsStatusCache;
import alluxio.underfs.UnderFileSystem;
import alluxio.util.LogUtils;
import alluxio.util.interfaces.Scoped;
import alluxio.util.io.PathUtils;

import com.codahale.metrics.Counter;
import com.google.common.base.MoreObjects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.time.Clock;
import java.util.ArrayList;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.Function;
import javax.annotation.Nullable;

/**
 * This class is responsible for maintaining the logic which surrounds syncing metadata between
 * Alluxio and its UFSes.
 *
 * This implementation uses a BFS-based approach to crawl the inode tree. In order to speed up
 * the sync process we use an {@link ExecutorService} which we submit inode paths to using
 * {@link #processSyncPath(AlluxioURI, RpcContext)}.
 * The processing of inode paths will discover new paths to
 * sync depending on the {@link #mDescendantType}. Syncing is finished when all submitted tasks
 * are completed and there are no new inodes left in the queue.
 *
 * Syncing inode metadata requires making calls to the UFS. This implementation will schedule UFS
 * RPCs with the {@link UfsStatusCache#prefetchChildren(AlluxioURI, MountTable)}. Then, once the
 * inode begins processing, it can retrieve the results. After processing, it can then remove its
 * {@link UfsStatus} from the cache. This strategy helps reduce memory pressure on the master
 * while performing a sync for a large tree. Additionally, by using a prefetch mechanism we can
 * concurrently process other inodes while waiting for UFS RPCs to complete.
 *
 * With regard to locking, this class expects to be able to take a write lock on any inode, and
 * then subsequently downgrades or unlocks after the sync is finished. Even though we use
 * {@link java.util.concurrent.locks.ReentrantReadWriteLock}, because we concurrently process
 * inodes on separate threads, we cannot utilize the reentrant behavior. This means the
 * caller of this class must not hold a write lock while calling {@link #sync()}.
 *
 * A user of this class is expected to create a new instance for each path that they would like
 * to process. This is because the Lock on the {@link #mRootScheme} may be changed after calling
 * {@link #sync()}.
 *
 * When a sync happens on a directory, only the sync timestamp of the directory itself will be
 * updated (including information if the sync was recursive) and not its children.
 * Whenever a path is checked, the sync times of its parents up to the root are also checked.
 * There are currently two reasons for this, the first is so that the sync cache will be
 * updated only on the parent directory level and not for every child synced meaning there will be
 * fewer entries in the cache. Second that updating the children times individually would require
 * a redesign because the sync paths are not tracked in a way currently where they know
 * when their children finish (apart from the root sync path).
 *
 * When checking if a child of the root sync path needs to be synced, the following
 * two items are considered:
 * 1. If a child directory does not need to be synced, then it will not be synced.
 * The parent will then update its sync time only to the oldest sync time of a child
 * that was not synced (or the current clock time if all children were synced).
 * 2. If a child file does not need to be synced (but its updated state has already
 * been loaded from the UFS due to the listing of the parent directory) then the
 * sync is still performed (because no additional UFS operations are needed,
 * unless ACL is enabled for the UFS, then an additional UFS call would be
 * needed so the sync is skipped and the time is calculated as in 1.).
 *
 * To go through an example (note I am assuming every path here is a directory).
 * If say the interval is 100s, and the last synced timestamps are:
 * /a 0
 * /a/b 10
 * /a/c 0
 * /a/d 0
 * /a/e 0
 * /a/f 0
 * Then the current timestamp is 100 and a sync will trigger with the sync for /a/b skipped.
 * Then the timestamps look like the following:
 * /a 10
 * /a/b 10
 * /a/c 0
 * /a/d 0
 * /a/e 0
 * /a/f 0
 *
 * Now if we do a sync at timestamp 110, a metadata sync for /a will be triggered again and
 * all children are synced. After the operation, only the entry for /a is updated, but since
 * checking a path also checks its parents' sync times, all paths effectively have a sync
 * time of 110:
 * /a 110
 * /a/b 10
 * /a/c 0
 * /a/d 0
 * /a/e 0
 * /a/f 0
 *
 * Here is a second example:
 * If say the interval is 100s, the last synced timestamps are:
 * /a 0
 * /a/b 0
 * /a/c 0
 * /a/d 0
 * /a/e 0
 * /a/f 0
 * Now say at time 90 some children are synced individually.
 * Then the timestamps look like the following:
 * /a 0
 * /a/b 0
 * /a/c 90
 * /a/d 90
 * /a/e 90
 * /a/f 90
 *
 * and if we do a sync at timestamp 100, a sync will only happen on /a/b,
 * and /a will get updated to 90
 * /a 90
 * /a/b 0
 * /a/c 90
 * /a/d 90
 * /a/e 90
 * /a/f 90
 *
 * Note that we may consider different ways of deciding how to sync children
 * (see https://github.com/Alluxio/alluxio/pull/16081).
 */
public class InodeSyncStream {
  /**
   * Return status of a sync result.
   */
  public enum SyncStatus {
    /** The sync ran and was judged successful. */
    OK,
    /** The sync ran but was not judged successful. */
    FAILED,
    /**
     * The sync was skipped, either because the root path did not require a sync and the sync
     * was not forced, or because a concurrent sync had already refreshed the path.
     */
    NOT_NEEDED
  }

  private static final Logger LOG = LoggerFactory.getLogger(InodeSyncStream.class);

  private static final FileSystemMasterCommonPOptions NO_TTL_OPTION =
      FileSystemMasterCommonPOptions.newBuilder()
          .setTtl(-1)
          .build();

  /** The root path. Should be locked with a write lock. */
  private final LockingScheme mRootScheme;

  /** A {@link UfsSyncPathCache} maintained from the {@link DefaultFileSystemMaster}. */
  private final UfsSyncPathCache mUfsSyncPathCache;

  /** Object holding the {@link UfsStatus}es which may be required for syncing. */
  private final UfsStatusCache mStatusCache;

  /** Inode tree to lock new paths. */
  private final InodeTree mInodeTree;

  /** Determines how deep in the tree we need to load. */
  private final DescendantType mDescendantType;

  /** The {@link RpcContext} from the caller. */
  private final RpcContext mRpcContext;

  /** The inode store to look up children. */
  private final ReadOnlyInodeStore mInodeStore;

  /** The mount table for looking up the proper UFS client based on the Alluxio path. */
  private final MountTable mMountTable;

  /** The lock manager used to try acquiring the persisting lock. */
  private final InodeLockManager mInodeLockManager;

  /** The FS master creating this object. */
  private final DefaultFileSystemMaster mFsMaster;

  /** Set this to true to force a sync regardless of the UfsPathCache. */
  private final boolean mForceSync;

  /** The sync options on the RPC.  */
  private final FileSystemMasterCommonPOptions mSyncOptions;

  /** To determine if we should use the MergeJournalContext to merge journals. */
  private final boolean mUseFileSystemMergeJournalContext = Configuration.getBoolean(
      PropertyKey.MASTER_FILE_SYSTEM_MERGE_INODE_JOURNALS
  );

  /** To determine whether we should only let the UFS sync happen once
   * for the concurrent metadata sync requests syncing the same directory.
   */
  private final boolean mDedupConcurrentSync = Configuration.getBoolean(
      PropertyKey.MASTER_METADATA_CONCURRENT_SYNC_DEDUP
  );
  private static final MetadataSyncLockManager SYNC_METADATA_LOCK_MANAGER =
      new MetadataSyncLockManager();

  /** Whether to only read+create metadata from the UFS, or to update metadata as well. */
  private final boolean mLoadOnly;

  /** Deque used to keep track of paths that still need to be synced. */
  private final ConcurrentLinkedDeque mPendingPaths;

  /** The traversal order of {@link #mPendingPaths}. */
  private final MetadataSyncTraversalOrder mTraverseType;

  /** Queue of paths that have been submitted to the executor. */
  private final Queue> mSyncPathJobs;

  /** The executor enabling concurrent processing. */
  private final ExecutorService mMetadataSyncService;

  /** The interval of time passed (in ms) to require a new sync. */
  private final long mSyncInterval;

  /** The maximum number of concurrent paths that can be syncing at any moment. */
  private final int mConcurrencyLevel =
      Configuration.getInt(PropertyKey.MASTER_METADATA_SYNC_CONCURRENCY_LEVEL);

  private final boolean mGetDirectoryStatusSkipLoadingChildren =
      Configuration.getBoolean(
          PropertyKey.MASTER_METADATA_SYNC_GET_DIRECTORY_STATUS_SKIP_LOADING_CHILDREN);

  private final FileSystemMasterAuditContext mAuditContext;
  private final Function mAuditContextSrcInodeFunc;

  private final Clock mClock;

  /**
   * Create a new instance of {@link InodeSyncStream}.
   *
   * The root path should be already locked with {@link LockPattern#WRITE_EDGE} unless the user is
   * only planning on loading metadata. The desired pattern should always be
   * {@link LockPattern#READ}.
   *
   * It is an error to initiate sync without a WRITE_EDGE lock when loadOnly is {@code false}.
   * If loadOnly is set to {@code true}, then the root path may have a read lock.
   *
   * @param rootPath The root path to begin syncing
   * @param fsMaster the {@link FileSystemMaster} calling this method
   * @param syncPathCache the {@link UfsSyncPathCache} for the given path
   * @param rpcContext the caller's {@link RpcContext}
   * @param descendantType determines the number of descendant inodes to sync
   * @param options the RPC's {@link FileSystemMasterCommonPOptions}
   * @param auditContext the audit context to use when loading, may be {@code null}
   * @param auditContextSrcInodeFunc function that, given the locked root path, returns the inode
   *                                 to set as the audit context source, may be {@code null}
   * @param forceSync whether to sync inode metadata no matter what
   * @param loadOnly whether to only load new metadata, rather than update existing metadata
   * @param loadAlways whether to always load new metadata from the ufs, even if a file or
   *                   directory has been previously found to not exist
   */
  public InodeSyncStream(LockingScheme rootPath, DefaultFileSystemMaster fsMaster,
      UfsSyncPathCache syncPathCache,
      RpcContext rpcContext, DescendantType descendantType, FileSystemMasterCommonPOptions options,
      @Nullable FileSystemMasterAuditContext auditContext,
      @Nullable Function<LockedInodePath, Inode> auditContextSrcInodeFunc,
      boolean forceSync, boolean loadOnly, boolean loadAlways)
  {
    mPendingPaths = new ConcurrentLinkedDeque<>();
    mTraverseType = Configuration.getEnum(PropertyKey.MASTER_METADATA_SYNC_TRAVERSAL_ORDER,
        MetadataSyncTraversalOrder.class);
    mDescendantType = descendantType;
    mRpcContext = rpcContext;
    mMetadataSyncService = fsMaster.mSyncMetadataExecutorIns;
    mClock = fsMaster.mClock;
    mForceSync = forceSync;
    mRootScheme = rootPath;
    mSyncOptions = options;
    mLoadOnly = loadOnly;
    mSyncPathJobs = new LinkedList<>();
    mFsMaster = fsMaster;
    mInodeLockManager = fsMaster.getInodeLockManager();
    mInodeStore = fsMaster.getInodeStore();
    mInodeTree = fsMaster.getInodeTree();
    mMountTable = fsMaster.getMountTable();
    mUfsSyncPathCache = syncPathCache;
    mAuditContext = auditContext;
    mAuditContextSrcInodeFunc = auditContextSrcInodeFunc;
    // A sync interval on the RPC options takes precedence over the configured default
    mSyncInterval = options.hasSyncIntervalMs() ? options.getSyncIntervalMs() :
        Configuration.getMs(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL);
    // If an absent cache entry was more recent than this value, then it is valid for this sync
    long validCacheTime;
    if (loadOnly) {
      if (loadAlways) {
        validCacheTime = UfsAbsentPathCache.NEVER;
      } else {
        validCacheTime = UfsAbsentPathCache.ALWAYS;
      }
    } else {
      validCacheTime = mClock.millis() - mSyncInterval;
    }
    mStatusCache = new UfsStatusCache(fsMaster.mSyncPrefetchExecutorIns,
        fsMaster.getAbsentPathCache(), validCacheTime);
    // Maintain a global counter of active sync streams
    DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_COUNT.inc();
  }

  /**
   * Create a new instance of {@link InodeSyncStream} without any audit or permission checks.
   *
   * This delegates to the full constructor, passing {@code null} for both the audit context and
   * the audit context source inode function.
   *
   * @param rootScheme The root path to begin syncing
   * @param fsMaster the {@link FileSystemMaster} calling this method
   * @param syncPathCache the {@link UfsSyncPathCache} for this path
   * @param rpcContext the caller's {@link RpcContext}
   * @param descendantType determines the number of descendant inodes to sync
   * @param options the RPC's {@link FileSystemMasterCommonPOptions}
   * @param forceSync whether to sync inode metadata no matter what
   * @param loadOnly whether to only load new metadata, rather than update existing metadata
   * @param loadAlways whether to always load new metadata from the ufs, even if a file or
   *                   directory has been previously found to not exist
   */
  public InodeSyncStream(LockingScheme rootScheme, DefaultFileSystemMaster fsMaster,
      UfsSyncPathCache syncPathCache,
      RpcContext rpcContext, DescendantType descendantType, FileSystemMasterCommonPOptions options,
      boolean forceSync, boolean loadOnly, boolean loadAlways)
  {
    this(rootScheme, fsMaster, syncPathCache, rpcContext, descendantType, options, null, null,
        forceSync, loadOnly, loadAlways);
  }

  /**
   * Syncs the metadata of the root path this stream was created with.
   *
   * If the root locking scheme reports that no sync is required and the sync is not forced, the
   * call returns {@link SyncStatus#NOT_NEEDED} immediately. Otherwise the sync is executed,
   * either directly or, when concurrent sync dedup is enabled, after locking the root path
   * through the shared metadata sync lock manager.
   *
   * [WARNING]:
   * To avoid deadlock, please do not obtain any inode path lock before calling this method.
   *
   * @return the {@link SyncStatus} describing the outcome of the sync
   * @throws AccessControlException propagated from the internal sync
   * @throws InvalidPathException propagated from the internal sync
   */
  public SyncStatus sync() throws AccessControlException, InvalidPathException {
    LOG.debug("Running InodeSyncStream on path {}, with status {}, and force sync {}",
        mRootScheme.getPath(), mRootScheme.shouldSync(), mForceSync);
    final boolean syncWanted = mRootScheme.shouldSync().isShouldSync() || mForceSync;
    if (!syncWanted) {
      DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_SKIPPED.inc();
      return SyncStatus.NOT_NEEDED;
    }
    if (mDedupConcurrentSync) {
      // Hold the per-path metadata sync lock for the whole duration of the sync
      try (MetadataSyncLockManager.MetadataSyncPathList ignored =
               SYNC_METADATA_LOCK_MANAGER.lockPath(mRootScheme.getPath())) {
        mRpcContext.throwIfCancelled();
        return syncInternal();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    return syncInternal();
  }

  /**
   * Performs the actual sync of the root path and, depending on the descendant type, of the
   * paths queued while processing it.
   *
   * The root path is synced first on the calling thread. The remaining paths are then submitted
   * to the metadata sync executor, at most {@link #mConcurrencyLevel} at a time, and their
   * results are collected in submission order. Afterwards the sync path cache is notified if the
   * sync was successful, any outstanding work is cancelled, and the metrics are updated.
   *
   * @return {@link SyncStatus#NOT_NEEDED} if dedup is enabled and a concurrent sync already
   *         refreshed the path, {@link SyncStatus#OK} if the sync was judged successful,
   *         {@link SyncStatus#FAILED} otherwise
   * @throws AccessControlException if thrown while syncing the root path
   * @throws InvalidPathException if thrown while syncing the root path
   */
  private SyncStatus syncInternal() throws
      AccessControlException, InvalidPathException {
    // The high-level process for the syncing is:
    // 1. Given an Alluxio path, determine if it is not consistent with the corresponding UFS path.
    //     this means the UFS path does not exist, or has metadata which differs from Alluxio
    // 2. If only the metadata changed, update the inode with the new metadata
    // 3. If the path does not exist in the UFS, delete the inode in Alluxio
    // 4. If not deleted, load metadata from the UFS
    // 5. If a recursive sync, add children inodes to sync queue
    int syncPathCount = 0;
    int failedSyncPathCount = 0;
    int skippedSyncPathCount = 0;
    int stopNum = -1; // stop syncing when we've processed this many paths. -1 for infinite
    if (mDedupConcurrentSync && mRootScheme.shouldSync() != SyncCheck.SHOULD_SYNC) {
      /*
       * If a concurrent sync on the same path is successful after this sync had already
       * been initialized and that sync is successful, then there is no need to sync again.
       * This is done by checking is the last successful sync time for the path has
       * increased since this sync was started.
       * * e.g.
       * First assume the last sync time for path /aaa is 0
       * 1. [TS=100] the sync() method is called by thread A for path /aaa with sync
       *    interval 50, so a sync starts
       * 2. [TS=110] the sync() method is called by thread B for path /aaa,
       *    using syncInterval 100, so a sync starts, but
       *    thread B is blocked by the metadata sync lock,
       * 3. [TS=180] thread A finishes the metadata sync, update the SyncPathCache,
       *    setting the last sync timestamp to 100.
       * 4. [TS=182] thread B acquired the lock and can start sync
       * 5. [TS=182] since the sync time for the path was 0 when thread B started,
       *    and is now 100, thread B can skip the sync and return NOT_NEEDED.
       * Note that this still applies if A is to sync recursively path /aaa while B is to
       * sync path /aaa/bbb as the sync scope of A covers B's.
       */
      boolean shouldSkipSync =
          mUfsSyncPathCache.shouldSyncPath(mRootScheme.getPath(), mSyncInterval,
          mDescendantType).getLastSyncTime() > mRootScheme.shouldSync().getLastSyncTime();
      if (shouldSkipSync) {
        DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_SKIPPED.inc();
        LOG.debug("Skipped sync on {} due to successful concurrent sync", mRootScheme.getPath());
        return SyncStatus.NOT_NEEDED;
      }
    }
    LOG.debug("Running InodeSyncStream on path {}", mRootScheme.getPath());
    long startTime = mUfsSyncPathCache.recordStartSync();
    boolean rootPathIsFile = false;

    RpcContext rpcContext = getMetadataSyncRpcContext();

    try (LockedInodePath path =
             mInodeTree.lockInodePath(mRootScheme, rpcContext.getJournalContext())) {
      if (mAuditContext != null && mAuditContextSrcInodeFunc != null) {
        mAuditContext.setSrcInode(mAuditContextSrcInodeFunc.apply(path));
      }
      syncInodeMetadata(path, rpcContext);
      syncPathCount++;
      if (mDescendantType == DescendantType.ONE) {
        // If descendantType is ONE, then we shouldn't process any more paths except for those
        // currently in the queue
        stopNum = mPendingPaths.size();
      } else if (mGetDirectoryStatusSkipLoadingChildren && mDescendantType == DescendantType.NONE) {
        // If descendantType is NONE, do not process any path in the queue after
        // the inode itself is loaded.
        stopNum = 0;
      }

      // process the sync result for the original path
      try {
        path.traverse();
        if (path.fullPathExists()) {
          rootPathIsFile = !path.getInode().isDirectory();
        }
      } catch (InvalidPathException e) {
        updateMetrics(false, startTime, syncPathCount, failedSyncPathCount);
        throw new RuntimeException(e);
      }
    } catch (FileDoesNotExistException e) {
      LOG.warn("Failed to sync metadata on root path {} because it"
              + " does not exist on the UFS or in Alluxio", this);
      failedSyncPathCount++;
    } catch (BlockInfoException | FileAlreadyCompletedException
        | InterruptedException | InvalidFileSizeException
        | IOException e) {
      LogUtils.warnWithException(LOG, "Failed to sync metadata on root path {}",
          toString(), e);
      failedSyncPathCount++;
    } catch (InvalidPathException | AccessControlException e) {
      // Catch and re-throw just to update metrics before exit
      LogUtils.warnWithException(LOG, "Failed to sync metadata on root path {}",
          toString(), e);
      updateMetrics(false, startTime, syncPathCount, failedSyncPathCount);
      throw e;
    } finally {
      // regardless of the outcome, remove the UfsStatus for this path from the cache
      mStatusCache.remove(mRootScheme.getPath());
      // add the remaining journals into the async journal writer
      maybeFlushJournalToAsyncJournalWriter(rpcContext);
    }

    // For any children that skip syncing because of a recent sync time,
    // we will only update the root path to the oldest of these times
    Long childOldestSkippedSync = null;
    // Process any children after the root.
    while (!mPendingPaths.isEmpty() || !mSyncPathJobs.isEmpty()) {
      if (Thread.currentThread().isInterrupted()) {
        LOG.warn("Metadata syncing was interrupted before completion; {}", this);
        break;
      }
      if (mRpcContext.isCancelled()) {
        LOG.warn("Metadata syncing was cancelled before completion; {}", this);
        break;
      }
      // There are still paths to process
      // First, remove any futures which have completed. Add to the sync path count if they sync'd
      // successfully
      while (true) {
        Future<SyncResult> job = mSyncPathJobs.peek();
        if (job == null || !job.isDone()) {
          break;
        }
        // remove the job because we know it is done.
        if (mSyncPathJobs.poll() != job) {
          updateMetrics(false, startTime, syncPathCount, failedSyncPathCount);
          throw new ConcurrentModificationException("Head of queue modified while executing");
        }
        // Update a global counter
        DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_ACTIVE_PATHS_TOTAL.dec();
        try {
          // we synced the path successfully
          // This shouldn't block because we checked job.isDone() earlier
          SyncResult result = job.get();
          if (!result.isResultValid()) {
            failedSyncPathCount++;
          } else if (result.wasSyncPerformed()) {
            syncPathCount++;
          } else {
            skippedSyncPathCount++;
          }
          // Track the oldest sync time among the children that were validly skipped
          if (result.isResultValid() && !result.wasSyncPerformed()) {
            childOldestSkippedSync = childOldestSkippedSync == null ? result.getLastSyncTime()
                : Math.min(childOldestSkippedSync, result.getLastSyncTime());
          }
        } catch (InterruptedException | ExecutionException e) {
          failedSyncPathCount++;
          LogUtils.warnWithException(
              LOG, "metadata sync failed while polling for finished paths; {}",
              toString(), e);
          if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
            break;
          }
        }
      }

      // When using descendant type of ONE, we need to stop prematurely.
      if (stopNum != -1 && (syncPathCount + failedSyncPathCount + skippedSyncPathCount) > stopNum) {
        break;
      }

      // We can submit up to ( max_concurrency - <jobs queue size> ) jobs back into the queue
      int submissions = mConcurrencyLevel - mSyncPathJobs.size();
      for (int i = 0; i < submissions; i++) {
        AlluxioURI path = pollItem();
        if (path == null) {
          // no paths left to sync
          break;
        }
        RpcContext rpcContextForSyncPath = getMetadataSyncRpcContext();
        Future<SyncResult> job =
            mMetadataSyncService.submit(() -> processSyncPath(path, rpcContextForSyncPath));
        mSyncPathJobs.offer(job);
        // Update global counters for all sync streams
        DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_PENDING_PATHS_TOTAL.dec();
        DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_ACTIVE_PATHS_TOTAL.inc();
      }
      // After submitting all jobs wait for the job at the head of the queue to finish.
      Future<SyncResult> oldestJob = mSyncPathJobs.peek();
      if (oldestJob == null) { // There might not be any jobs, restart the loop.
        continue;
      }
      try {
        oldestJob.get(); // block until the oldest job finished.
      } catch (InterruptedException | ExecutionException e) {
        LogUtils.warnWithException(
                LOG, "Exception while waiting for oldest metadata sync job to finish: {}",
                toString(), e);
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
      }
    }

    boolean success = syncPathCount > 0;
    if (Configuration.getBoolean(PropertyKey.MASTER_METADATA_SYNC_REPORT_FAILURE)) {
      // There should not be any failed or outstanding jobs
      success = (failedSyncPathCount == 0) && mSyncPathJobs.isEmpty() && mPendingPaths.isEmpty();
    }
    if (success) {
      // update the sync path cache for the root of the sync
      // TODO(gpang): Do we need special handling for failures and thread interrupts?
      mUfsSyncPathCache.notifySyncedPath(mRootScheme.getPath(), mDescendantType,
          startTime, childOldestSkippedSync, rootPathIsFile);
    }
    // Cancel whatever is still outstanding and account for it in the global counters
    mStatusCache.cancelAllPrefetch();
    mSyncPathJobs.forEach(f -> f.cancel(true));
    if (!mPendingPaths.isEmpty() || !mSyncPathJobs.isEmpty()) {
      DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_SYNC_PATHS_CANCEL.inc(
          mPendingPaths.size() + mSyncPathJobs.size());
    }
    if (!mSyncPathJobs.isEmpty()) {
      DefaultFileSystemMaster.Metrics
          .INODE_SYNC_STREAM_ACTIVE_PATHS_TOTAL.dec(mSyncPathJobs.size());
    }
    if (!mPendingPaths.isEmpty()) {
      DefaultFileSystemMaster.Metrics
          .INODE_SYNC_STREAM_PENDING_PATHS_TOTAL.dec(mPendingPaths.size());
    }

    maybeFlushJournalToAsyncJournalWriter(rpcContext);

    // Update metrics at the end of operation
    updateMetrics(success, startTime, syncPathCount, failedSyncPathCount);
    return success ? SyncStatus.OK : SyncStatus.FAILED;
  }

  /**
   * Records the outcome of this sync stream in the file system master metrics and, when debug
   * logging is enabled, logs a one-line summary.
   *
   * @param success whether the sync stream as a whole is counted as a success
   * @param startTime the start time that is subtracted from the current clock reading to get
   *                  the duration
   * @param successPathCount the number of paths to add to the successful path counter
   * @param failedPathCount the number of paths to add to the failed path counter
   */
  private void updateMetrics(boolean success, long startTime,
      int successPathCount, int failedPathCount) {
    final long elapsedMs = mClock.millis() - startTime;
    DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_TIME_MS.inc(elapsedMs);
    if (!success) {
      DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_FAIL.inc();
    } else {
      DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_SUCCESS.inc();
    }
    DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_SYNC_PATHS_SUCCESS.inc(successPathCount);
    DefaultFileSystemMaster.Metrics.INODE_SYNC_STREAM_SYNC_PATHS_FAIL.inc(failedPathCount);
    if (!LOG.isDebugEnabled()) {
      return;
    }
    final int totalPathCount = successPathCount + failedPathCount;
    LOG.debug("synced {} paths ({} success, {} failed) in {} ms on {}",
        totalPathCount, successPathCount, failedPathCount,
        elapsedMs, mRootScheme);
  }

  /**
   * Process a path to sync.
   *
   * This can update metadata for the inode, delete the inode, and/or queue any children that should
   * be synced as well. Whatever the outcome, the journal of the given context may be flushed to
   * the async journal writer afterwards.
   *
   * @param path The path to sync
   * @param rpcContext the {@link RpcContext} used while syncing this path
   * @return the {@link SyncResult} of processing the path
   */
  private SyncResult processSyncPath(AlluxioURI path, RpcContext rpcContext)
      throws InvalidPathException {
    final SyncResult outcome;
    try {
      outcome = processSyncPathInternal(path, rpcContext);
    } finally {
      maybeFlushJournalToAsyncJournalWriter(rpcContext);
    }
    return outcome;
  }

  /**
   * Syncs a single queued path under a READ-pattern locking scheme.
   *
   * @param path the path to sync; {@code null} yields an invalid result
   * @param rpcContext the {@link RpcContext} whose journal context is used for locking and syncing
   * @return the skipped-sync result if no sync is needed, the sync-success result if the inode
   *         metadata was synced, or {@link SyncResult#INVALID_RESULT} if the path is
   *         {@code null}, the thread was interrupted, or the sync failed with an exception
   */
  private SyncResult processSyncPathInternal(AlluxioURI path, RpcContext rpcContext)
      throws InvalidPathException {
    if (path == null) {
      return SyncResult.INVALID_RESULT;
    }

    // When ACL is disabled and the status cache already holds a non-directory status for this
    // path, the sync is always completed (even if it is not needed), because all the data
    // required for it has been loaded already.
    final boolean fileStatusAlreadyLoaded = !mFsMaster.isAclEnabled()
        && mStatusCache.hasStatus(path).map(status -> !status.isDirectory()).orElse(false);

    // A forced scheme is used when the stream itself is forced or the status was prefetched;
    // otherwise the sync path cache decides whether this path has to be synced.
    final LockingScheme scheme = (mForceSync || fileStatusAlreadyLoaded)
        ? new LockingScheme(path, LockPattern.READ, true)
        : new LockingScheme(path, LockPattern.READ, mSyncOptions,
            mUfsSyncPathCache, mDescendantType);

    if (!(scheme.shouldSync().isShouldSync() || mForceSync)) {
      return scheme.shouldSync().skippedSync();
    }
    try (LockedInodePath inodePath =
             mInodeTree.tryLockInodePath(scheme, rpcContext.getJournalContext())) {
      if (Thread.currentThread().isInterrupted()) {
        LOG.warn("Thread syncing {} was interrupted before completion", inodePath.getUri());
        return SyncResult.INVALID_RESULT;
      }
      syncInodeMetadata(inodePath, rpcContext);
      return scheme.shouldSync().syncSuccess();
    } catch (AccessControlException | BlockInfoException | FileAlreadyCompletedException
        | FileDoesNotExistException | InterruptedException | InvalidFileSizeException
        | InvalidPathException | IOException e) {
      LogUtils.warnWithException(LOG, "Failed to process sync path: {}", path, e);
      return SyncResult.INVALID_RESULT;
    } finally {
      // regardless of the outcome, remove the UfsStatus for this path from the cache
      mStatusCache.remove(path);
    }
  }

  /**
   * Syncs the metadata of the inode at the given locked path, loading the metadata first when
   * the full path does not exist yet.
   *
   * @param inodePath the locked inode path to sync
   * @param rpcContext the {@link RpcContext} to use for the load and the sync
   */
  private void syncInodeMetadata(LockedInodePath inodePath, RpcContext rpcContext)
      throws InvalidPathException, AccessControlException, IOException, FileDoesNotExistException,
      FileAlreadyCompletedException, InvalidFileSizeException, BlockInfoException,
      InterruptedException {
    final boolean pathWasMissing = !inodePath.fullPathExists();
    if (pathWasMissing) {
      loadMetadataForPath(inodePath, rpcContext);
    }
    // skip the load metadata step in the sync if it has been just loaded
    syncExistingInodeMetadata(inodePath, rpcContext, pathWasMissing);
  }

  private Object getFromUfs(Callable