/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY;
import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.Objects;
import java.util.Optional;
import java.util.RandomAccess;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.LongAdder;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.CompoundConfiguration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.ExtendedCellBuilderFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MetaCellComparator;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.TagUtil;
import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.client.Append;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.CompactionState;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.IsolationLevel;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.PackagePrivateFieldAccessor;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.conf.ConfigurationManager;
import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
import org.apache.hadoop.hbase.exceptions.TimeoutIOException;
import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
import org.apache.hadoop.hbase.filter.ByteArrayComparable;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterWrapper;
import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
import org.apache.hadoop.hbase.io.HFileLink;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.TimeRange;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
import org.apache.hadoop.hbase.ipc.RpcCall;
import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.mob.MobFileCache;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager;
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker;
import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector;
import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
import org.apache.hadoop.hbase.replication.ReplicationUtils;
import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HashedBytes;
import org.apache.hadoop.hbase.util.NonceKey;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.TableDescriptorChecker;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALSplitUtil;
import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay;
import org.apache.hadoop.util.StringUtils;
import org.apache.htrace.core.TraceScope;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
import org.apache.hbase.thirdparty.com.google.protobuf.Service;
import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;
import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;
/**
* Regions store data for a certain region of a table. A Region stores all columns
* for each row. A given table consists of one or more Regions.
*
* A Region is defined by its table and its key extent.
*
* Locking at the Region level serves only one purpose: preventing the
* region from being closed (and consequently split) while other operations
* are ongoing. Each row level operation obtains both a row lock and a region
* read lock for the duration of the operation. While a scanner is being
* constructed, getScanner holds a read lock. If the scanner is successfully
* constructed, it holds a read lock until it is closed. A close takes out a
* write lock and consequently will block for ongoing operations and will block
* new operations from starting while the close is in progress.
*/
@SuppressWarnings("deprecation")
@InterfaceAudience.Private
public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
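/*
* Editor's note: an illustrative sketch (not part of the original class) of how a caller is
* expected to interact with the region-level lock described in the class comment above. The
* body of the try block is hypothetical; startRegionOperation and closeRegionOperation are the
* Region methods that take and release the region read lock.
*
*   region.startRegionOperation(Region.Operation.GET);  // read lock; fails if closing/closed
*   try {
*     // row-level work goes here; row locks are taken per row as needed
*   } finally {
*     region.closeRegionOperation(Region.Operation.GET); // release the read lock
*   }
*/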
private static final Logger LOG = LoggerFactory.getLogger(HRegion.class);
public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
"hbase.hregion.scan.loadColumnFamiliesOnDemand";
public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize";
public static final int DEFAULT_MAX_CELL_SIZE = 10485760;
public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE =
"hbase.regionserver.minibatch.size";
public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000;
public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync";
public static final boolean DEFAULT_WAL_HSYNC = false;
/**
* This is for using HRegion as local storage, where we may put the recovered edits in a
* special place. Once this is set, we will only replay the recovered edits under this directory
* and ignore the original replay directory configs.
*/
public static final String SPECIAL_RECOVERED_EDITS_DIR =
"hbase.hregion.special.recovered.edits.dir";
/**
* Whether to use {@link MetaCellComparator} even if this is not a meta region. Used when creating
* master local region.
*/
public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator";
public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false;
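/*
* Editor's note: an illustrative sketch with hypothetical values. Both of the special-purpose
* knobs above are plain configuration properties set before the region is opened, e.g.
*
*   Configuration conf = HBaseConfiguration.create();
*   conf.set(HRegion.SPECIAL_RECOVERED_EDITS_DIR, "/some/local/recovered.edits");
*   conf.setBoolean(HRegion.USE_META_CELL_COMPARATOR, true);
*/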
final AtomicBoolean closed = new AtomicBoolean(false);
/* Closing can take some time; use the closing flag if there is stuff we don't
* want to do while in the closing state; e.g. offering this region up to the
* master as a region to close if the carrying regionserver is overloaded.
* Once set, it is never cleared.
*/
final AtomicBoolean closing = new AtomicBoolean(false);
/**
* The max sequence id of flushed data on this region. There is no edit in memory that is
* less than this sequence id.
*/
private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
/**
* Record the sequence id of last flush operation. Can be in advance of
* {@link #maxFlushedSeqId} when flushing a single column family. In this case,
* {@link #maxFlushedSeqId} will be older than the oldest edit in memory.
*/
private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
/**
* The sequence id of the last replayed open region event from the primary region. This is used
* to skip entries before it, due to the possibility of replayed edits coming out of order from
* replication.
*/
protected volatile long lastReplayedOpenRegionSeqId = -1L;
protected volatile long lastReplayedCompactionSeqId = -1L;
//////////////////////////////////////////////////////////////////////////////
// Members
//////////////////////////////////////////////////////////////////////////////
// map from a locked row to the context for that lock including:
// - CountDownLatch for threads waiting on that row
// - the thread that owns the lock (allow reentrancy)
// - reference count of (reentrant) locks held by the thread
// - the row itself
private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
new ConcurrentHashMap<>();
protected final Map<byte[], HStore> stores =
new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR);
// TODO: account for each registered handler in HeapSize computation
private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
// Track data size in all memstores
private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing();
@VisibleForTesting
RegionServicesForStores regionServicesForStores;
// Debug possible data loss due to WAL off
final LongAdder numMutationsWithoutWAL = new LongAdder();
final LongAdder dataInMemoryWithoutWAL = new LongAdder();
// Debug why CAS operations are taking a while.
final LongAdder checkAndMutateChecksPassed = new LongAdder();
final LongAdder checkAndMutateChecksFailed = new LongAdder();
// Number of requests
// Count rows for scan
final LongAdder readRequestsCount = new LongAdder();
final LongAdder filteredReadRequestsCount = new LongAdder();
// Count rows for multi row mutations
final LongAdder writeRequestsCount = new LongAdder();
// Number of requests blocked by memstore size.
private final LongAdder blockedRequestsCount = new LongAdder();
// Compaction LongAdders
final LongAdder compactionsFinished = new LongAdder();
final LongAdder compactionsFailed = new LongAdder();
final LongAdder compactionNumFilesCompacted = new LongAdder();
final LongAdder compactionNumBytesCompacted = new LongAdder();
final LongAdder compactionsQueued = new LongAdder();
final LongAdder flushesQueued = new LongAdder();
private BlockCache blockCache;
private MobFileCache mobFileCache;
private final WAL wal;
private final HRegionFileSystem fs;
protected final Configuration conf;
private final Configuration baseConf;
private final int rowLockWaitDuration;
static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
private Path regionDir;
private FileSystem walFS;
// set to true if the region is restored from snapshot
private boolean isRestoredRegion = false;
public void setRestoredRegion(boolean restoredRegion) {
isRestoredRegion = restoredRegion;
}
// The internal wait duration to acquire a lock before read/update
// from the region. It is not per row. The purpose of this wait time
// is to avoid waiting a long time while the region is busy, so that
// we can release the IPC handler soon enough to improve the
// availability of the region server. It can be adjusted by
// tuning configuration "hbase.busy.wait.duration".
final long busyWaitDuration;
static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
// If updating multiple rows in one call, wait longer,
// i.e. waiting for busyWaitDuration * # of rows. However,
// we can limit the max multiplier.
final int maxBusyWaitMultiplier;
// Max busy wait duration. There is no point in waiting longer than the RPC
// purge timeout, after which an RPC call will be terminated by the RPC engine.
final long maxBusyWaitDuration;
// Max cell size. If nonzero, the maximum allowed size for any given cell
// in bytes
final long maxCellSize;
// Number of mutations for minibatch processing.
private final int miniBatchSize;
// negative number indicates infinite timeout
static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
/**
* The sequence ID that was encountered when this region was opened.
*/
private long openSeqNum = HConstants.NO_SEQNUM;
/**
* The default setting for whether to enable on-demand CF loading for
* scan requests to this region. Requests can override it.
*/
private boolean isLoadingCfsOnDemandDefault = false;
private final AtomicInteger majorInProgress = new AtomicInteger(0);
private final AtomicInteger minorInProgress = new AtomicInteger(0);
//
// Context: During replay we want to ensure that we do not lose any data. So, we
// have to be conservative in how we replay wals. For each store, we calculate
// the maxSeqId up to which the store was flushed. And, skip the edits which
// are equal to or lower than maxSeqId for each store.
// The following map is populated when opening the region
Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR);
/** Saved state from replaying prepare flush cache */
private PrepareFlushResult prepareFlushResult = null;
private volatile ConfigurationManager configurationManager;
// Used for testing.
private volatile Long timeoutForWriteLock = null;
private final CellComparator cellComparator;
/**
* @return The smallest mvcc readPoint across all the scanners in this
* region. Writes older than this readPoint are included in every
* read operation.
*/
public long getSmallestReadPoint() {
long minimumReadPoint;
// We need to ensure that while we are calculating the smallestReadPoint
// no new RegionScanners can grab a readPoint that we are unaware of.
// We achieve this by synchronizing on the scannerReadPoints object.
synchronized (scannerReadPoints) {
minimumReadPoint = mvcc.getReadPoint();
for (Long readPoint : this.scannerReadPoints.values()) {
if (readPoint < minimumReadPoint) {
minimumReadPoint = readPoint;
}
}
}
return minimumReadPoint;
}
/*
* Data structure of write state flags used to coordinate flushes,
* compactions and closes.
*/
static class WriteState {
// Set while a memstore flush is happening.
volatile boolean flushing = false;
// Set when a flush has been requested.
volatile boolean flushRequested = false;
// Number of compactions running.
AtomicInteger compacting = new AtomicInteger(0);
// Set to false in close. Once false, we cannot compact or flush again.
volatile boolean writesEnabled = true;
// Set if region is read-only
volatile boolean readOnly = false;
// whether the reads are enabled. This is different from readOnly, because readOnly is
// static in the lifetime of the region, while readsEnabled is dynamic
volatile boolean readsEnabled = true;
/**
* Set flags that make this region read-only.
*
* @param onOff flip value for region r/o setting
*/
synchronized void setReadOnly(final boolean onOff) {
this.writesEnabled = !onOff;
this.readOnly = onOff;
}
boolean isReadOnly() {
return this.readOnly;
}
boolean isFlushRequested() {
return this.flushRequested;
}
void setReadsEnabled(boolean readsEnabled) {
this.readsEnabled = readsEnabled;
}
static final long HEAP_SIZE = ClassSize.align(
ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
}
/**
* Objects from this class are created when flushing to describe all the different states that
* the flush method ends up in. The Result enum describes those states. The sequence id should only
* be specified if the flush was successful, and the failure message should only be specified
* if it didn't flush.
*/
public static class FlushResultImpl implements FlushResult {
final Result result;
final String failureReason;
final long flushSequenceId;
final boolean wroteFlushWalMarker;
/**
* Convenience constructor to use when the flush is successful; the failure message is set to
* null.
* @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
* @param flushSequenceId Generated sequence id that comes right after the edits in the
* memstores.
*/
FlushResultImpl(Result result, long flushSequenceId) {
this(result, flushSequenceId, null, false);
assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
.FLUSHED_COMPACTION_NEEDED;
}
/**
* Convenience constructor to use when we cannot flush.
* @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
* @param failureReason Reason why we couldn't flush.
*/
FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
this(result, -1, failureReason, wroteFlushMarker);
assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
}
/**
* Constructor with all the parameters.
* @param result Any of the Result.
* @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
* @param failureReason Reason why we couldn't flush, or null.
*/
FlushResultImpl(Result result, long flushSequenceId, String failureReason,
boolean wroteFlushMarker) {
this.result = result;
this.flushSequenceId = flushSequenceId;
this.failureReason = failureReason;
this.wroteFlushWalMarker = wroteFlushMarker;
}
/**
* Convenience method, the equivalent of checking if result is
* FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
* @return true if the memstores were flushed, else false.
*/
@Override
public boolean isFlushSucceeded() {
return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
.FLUSHED_COMPACTION_NEEDED;
}
/**
* Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
* @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
*/
@Override
public boolean isCompactionNeeded() {
return result == Result.FLUSHED_COMPACTION_NEEDED;
}
@Override
public String toString() {
return new StringBuilder()
.append("flush result:").append(result).append(", ")
.append("failureReason:").append(failureReason).append(", ")
.append("flush seq id:").append(flushSequenceId).toString();
}
@Override
public Result getResult() {
return result;
}
}
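/*
* Editor's note: an illustrative sketch of how a caller typically inspects a FlushResult,
* assuming a flush entry point such as HRegion#flush(boolean):
*
*   FlushResult result = region.flush(true);
*   if (result.isFlushSucceeded() && result.isCompactionNeeded()) {
*     // ask the compaction machinery to take a look at this region
*   }
*/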
/** A result object from prepare flush cache stage */
@VisibleForTesting
static class PrepareFlushResult {
final FlushResultImpl result; // indicating a failure result from prepare
final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
final TreeMap<byte[], List<Path>> committedFiles;
final TreeMap<byte[], MemStoreSize> storeFlushableSize;
final long startTime;
final long flushOpSeqId;
final long flushedSeqId;
final MemStoreSizing totalFlushableSize;
/** Constructs an early exit case */
PrepareFlushResult(FlushResultImpl result, long flushSeqId) {
this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD);
}
/** Constructs a successful prepare flush result */
PrepareFlushResult(
TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
TreeMap<byte[], List<Path>> committedFiles,
TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId,
long flushedSeqId, MemStoreSizing totalFlushableSize) {
this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
flushSeqId, flushedSeqId, totalFlushableSize);
}
private PrepareFlushResult(
FlushResultImpl result,
TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
TreeMap<byte[], List<Path>> committedFiles,
TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId,
long flushedSeqId, MemStoreSizing totalFlushableSize) {
this.result = result;
this.storeFlushCtxs = storeFlushCtxs;
this.committedFiles = committedFiles;
this.storeFlushableSize = storeFlushableSize;
this.startTime = startTime;
this.flushOpSeqId = flushSeqId;
this.flushedSeqId = flushedSeqId;
this.totalFlushableSize = totalFlushableSize;
}
public FlushResult getResult() {
return this.result;
}
}
/**
* A class that tracks exceptions that have been observed in one batch. Not thread safe.
*/
static class ObservedExceptionsInBatch {
private boolean wrongRegion = false;
private boolean failedSanityCheck = false;
private boolean wrongFamily = false;
/**
* @return If a {@link WrongRegionException} has been observed.
*/
boolean hasSeenWrongRegion() {
return wrongRegion;
}
/**
* Records that a {@link WrongRegionException} has been observed.
*/
void sawWrongRegion() {
wrongRegion = true;
}
/**
* @return If a {@link FailedSanityCheckException} has been observed.
*/
boolean hasSeenFailedSanityCheck() {
return failedSanityCheck;
}
/**
* Records that a {@link FailedSanityCheckException} has been observed.
*/
void sawFailedSanityCheck() {
failedSanityCheck = true;
}
/**
* @return If a {@link NoSuchColumnFamilyException} has been observed.
*/
boolean hasSeenNoSuchFamily() {
return wrongFamily;
}
/**
* Records that a {@link NoSuchColumnFamilyException} has been observed.
*/
void sawNoSuchFamily() {
wrongFamily = true;
}
}
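/*
* Editor's note: an illustrative sketch (hypothetical batch loop) of the intent of the tracker
* above: log each exception type only once per batch instead of once per failed mutation.
*
*   ObservedExceptionsInBatch observed = new ObservedExceptionsInBatch();
*   // on a sanity-check failure for one mutation in the batch:
*   if (!observed.hasSeenFailedSanityCheck()) {
*     LOG.warn("Batch contained a mutation that failed sanity checks");
*     observed.sawFailedSanityCheck();
*   }
*/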
final WriteState writestate = new WriteState();
long memstoreFlushSize;
final long timestampSlop;
final long rowProcessorTimeout;
// Last flush time for each Store. Useful when we are flushing per column family.
private final ConcurrentMap<HStore, Long> lastStoreFlushTimeMap = new ConcurrentHashMap<>();
final RegionServerServices rsServices;
private RegionServerAccounting rsAccounting;
private long flushCheckInterval;
// flushPerChanges is to prevent too many changes in memstore
private long flushPerChanges;
private long blockingMemStoreSize;
// Used to guard closes
final ReentrantReadWriteLock lock;
// Stop updates lock
private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
private boolean splitRequest;
private byte[] explicitSplitPoint = null;
private final MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
// Coprocessor host
private RegionCoprocessorHost coprocessorHost;
private TableDescriptor htableDescriptor = null;
private RegionSplitPolicy splitPolicy;
private FlushPolicy flushPolicy;
private final MetricsRegion metricsRegion;
private final MetricsRegionWrapperImpl metricsRegionWrapper;
private final Durability regionDurability;
private final boolean regionStatsEnabled;
// Stores the replication scope of the various column families of the table
// that have a non-default scope
private final NavigableMap<byte[], Integer> replicationScope = new TreeMap<>(
Bytes.BYTES_COMPARATOR);
private final StoreHotnessProtector storeHotnessProtector;
/**
* HRegion constructor. This constructor should only be used for testing and
* extensions. Instances of HRegion should be instantiated with the
* {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
*
* @param tableDir qualified path of directory where region should be located,
* usually the table directory.
* @param wal The WAL is the outbound log for any updates to the HRegion
* The wal file is a logfile from the previous execution that's
* custom-computed for this HRegion. The HRegionServer computes and sorts the
* appropriate wal info for this HRegion. If there is a previous wal file
* (implying that the HRegion has been written-to before), then read it from
* the supplied path.
* @param fs is the filesystem.
* @param confParam is global configuration settings.
* @param regionInfo RegionInfo that describes the region
* @param htd the table descriptor
* @param rsServices reference to {@link RegionServerServices} or null
* @deprecated Use other constructors.
*/
@Deprecated
@VisibleForTesting
public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
final Configuration confParam, final RegionInfo regionInfo,
final TableDescriptor htd, final RegionServerServices rsServices) {
this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
wal, confParam, htd, rsServices);
}
/**
* HRegion constructor. This constructor should only be used for testing and
* extensions. Instances of HRegion should be instantiated with the
* {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
*
* @param fs is the filesystem.
* @param wal The WAL is the outbound log for any updates to the HRegion
* The wal file is a logfile from the previous execution that's
* custom-computed for this HRegion. The HRegionServer computes and sorts the
* appropriate wal info for this HRegion. If there is a previous wal file
* (implying that the HRegion has been written-to before), then read it from
* the supplied path.
* @param confParam is global configuration settings.
* @param htd the table descriptor
* @param rsServices reference to {@link RegionServerServices} or null
*/
public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
final TableDescriptor htd, final RegionServerServices rsServices) {
if (htd == null) {
throw new IllegalArgumentException("Need table descriptor");
}
if (confParam instanceof CompoundConfiguration) {
throw new IllegalArgumentException("Need original base configuration");
}
this.wal = wal;
this.fs = fs;
// 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
this.baseConf = confParam;
this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues());
this.cellComparator = htd.isMetaTable() ||
conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR) ?
MetaCellComparator.META_COMPARATOR : CellComparatorImpl.COMPARATOR;
this.lock = new ReentrantReadWriteLock(conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK,
DEFAULT_FAIR_REENTRANT_CLOSE_LOCK));
this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
DEFAULT_CACHE_FLUSH_INTERVAL);
this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
+ MAX_FLUSH_PER_CHANGES);
}
int tmpRowLockDuration = conf.getInt("hbase.rowlock.wait.duration",
DEFAULT_ROWLOCK_WAIT_DURATION);
if (tmpRowLockDuration <= 0) {
LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row " +
"locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration);
tmpRowLockDuration = 1;
}
this.rowLockWaitDuration = tmpRowLockDuration;
this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
this.htableDescriptor = htd;
Set<byte[]> families = this.htableDescriptor.getColumnFamilyNames();
for (byte[] family : families) {
if (!replicationScope.containsKey(family)) {
int scope = htd.getColumnFamily(family).getScope();
// Only store those families that have a NON-DEFAULT scope
if (scope != REPLICATION_SCOPE_LOCAL) {
// Do a copy before storing it here.
replicationScope.put(Bytes.copy(family), scope);
}
}
}
this.rsServices = rsServices;
if (rsServices != null) {
this.blockCache = rsServices.getBlockCache().orElse(null);
this.mobFileCache = rsServices.getMobFileCache().orElse(null);
}
this.regionServicesForStores = new RegionServicesForStores(this, rsServices);
setHTableSpecificConf();
this.scannerReadPoints = new ConcurrentHashMap<>();
this.busyWaitDuration = conf.getLong(
"hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
+ busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
+ maxBusyWaitMultiplier + "). Their product should be positive");
}
this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
/*
* timestamp.slop provides a server-side constraint on the timestamp. This
* assumes that you base your TS around currentTimeMillis(). In this case,
* throw an error to the user if the user-specified TS is newer than now +
* slop. LATEST_TIMESTAMP == don't use this functionality
*/
this.timestampSlop = conf.getLong(
"hbase.hregion.keyvalue.timestamp.slop.millisecs",
HConstants.LATEST_TIMESTAMP);
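/*
* Editor's note: an illustrative sketch of the constraint the slop enables (not the exact
* production check; cellTimestamp is a placeholder for the timestamp being validated):
*
*   long now = EnvironmentEdgeManager.currentTime();
*   if (timestampSlop != HConstants.LATEST_TIMESTAMP && cellTimestamp > now + timestampSlop) {
*     throw new FailedSanityCheckException("Timestamp too new: " + cellTimestamp);
*   }
*/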
/**
* Timeout for the process time in processRowsWithLocks().
* Use -1 to switch off time bound.
*/
this.rowProcessorTimeout = conf.getLong(
"hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
this.storeHotnessProtector = new StoreHotnessProtector(this, conf);
boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC);
/**
* This is the global default value for durability. All tables/mutations not defining a
* durability or using USE_DEFAULT will default to this value.
*/
Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL;
this.regionDurability =
this.htableDescriptor.getDurability() == Durability.USE_DEFAULT ? defaultDurability :
this.htableDescriptor.getDurability();
decorateRegionConfiguration(conf);
if (rsServices != null) {
this.rsAccounting = this.rsServices.getRegionServerAccounting();
// don't initialize coprocessors if not running within a regionserver
// TODO: revisit if coprocessors should load in other cases
this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf);
} else {
this.metricsRegionWrapper = null;
this.metricsRegion = null;
}
if (LOG.isDebugEnabled()) {
// Write out region name, its encoded name and storeHotnessProtector as string.
LOG.debug("Instantiated " + this + "; " + storeHotnessProtector.toString());
}
configurationManager = null;
// disable stats tracking for system tables, but check the config for everything else
this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
false :
conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE);
this.miniBatchSize = conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE,
DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE);
// recover the metrics of read and write requests count if they were retained
if (rsServices != null && rsServices.getRegionServerAccounting() != null) {
Pair<Long, Long> retainedRWRequestsCnt = rsServices.getRegionServerAccounting()
.getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName());
if (retainedRWRequestsCnt != null) {
this.setReadRequestsCount(retainedRWRequestsCnt.getFirst());
this.setWriteRequestsCount(retainedRWRequestsCnt.getSecond());
// remove them since they won't be used again
rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt()
.remove(getRegionInfo().getEncodedName());
}
}
}
void setHTableSpecificConf() {
if (this.htableDescriptor == null) return;
long flushSize = this.htableDescriptor.getMemStoreFlushSize();
if (flushSize <= 0) {
flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE);
}
this.memstoreFlushSize = flushSize;
long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
this.blockingMemStoreSize = this.memstoreFlushSize * mult;
}
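// Editor's note, a worked example with the usual defaults (illustrative numbers): a table that
// does not override the flush size picks up hbase.hregion.memstore.flush.size = 134217728 bytes
// (128 MB); with hbase.hregion.memstore.block.multiplier = 4 this gives
// blockingMemStoreSize = 4 * 134217728 = 536870912 bytes (512 MB), the memstore size at which
// further updates to the region are rejected until a flush catches up.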
/**
* Initialize this region.
* Used only by tests and SplitTransaction to reopen the region.
* You should use createHRegion() or openHRegion()
* @return What the next sequence (edit) id should be.
* @throws IOException e
* @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
*/
@Deprecated
public long initialize() throws IOException {
return initialize(null);
}
/**
* Initialize this region.
*
* @param reporter Tickle every so often if initialize is taking a while.
* @return What the next sequence (edit) id should be.
* @throws IOException e
*/
@VisibleForTesting
long initialize(final CancelableProgressable reporter) throws IOException {
//Refuse to open the region if there is no column family in the table
if (htableDescriptor.getColumnFamilyCount() == 0) {
throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString()+
" should have at least one column family.");
}
MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
status.enableStatusJournal(true);
long nextSeqId = -1;
try {
nextSeqId = initializeRegionInternals(reporter, status);
return nextSeqId;
} catch (IOException e) {
LOG.warn("Failed initialize of region= {}, starting to roll back memstore",
getRegionInfo().getRegionNameAsString(), e);
// global memstore size will be decreased when dropping memstore
try {
//drop the memory used by memstore if open region fails
dropMemStoreContents();
} catch (IOException ioE) {
if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) {
LOG.warn("Failed to drop memstore of region= {}, "
+ "some chunks may never be released since MSLAB is enabled",
getRegionInfo().getRegionNameAsString());
}
}
throw e;
} finally {
// nextSeqid will be -1 if the initialization fails.
// At least it will be 0 otherwise.
if (nextSeqId == -1) {
status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
" initialization.");
}
if (LOG.isDebugEnabled()) {
LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
}
status.cleanup();
}
}
private long initializeRegionInternals(final CancelableProgressable reporter,
final MonitoredTask status) throws IOException {
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-open hook");
coprocessorHost.preOpen();
}
// Write HRI to a file in case we need to recover hbase:meta
// Only the primary replica should write .regioninfo
if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) {
status.setStatus("Writing region info on filesystem");
fs.checkRegionInfoOnFilesystem();
}
// Initialize all the HStores
status.setStatus("Initializing all the Stores");
long maxSeqId = initializeStores(reporter, status);
this.mvcc.advanceTo(maxSeqId);
if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
Collection<HStore> stores = this.stores.values();
try {
// update the stores that we are replaying
stores.forEach(HStore::startReplayingFromWAL);
// Recover any edits if available.
maxSeqId = Math.max(maxSeqId,
replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status));
// Recover any hfiles if available
maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores));
// Make sure mvcc is up to max.
this.mvcc.advanceTo(maxSeqId);
} finally {
// update the stores that we are done replaying
stores.forEach(HStore::stopReplayingFromWAL);
}
}
this.lastReplayedOpenRegionSeqId = maxSeqId;
this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
this.writestate.flushRequested = false;
this.writestate.compacting.set(0);
if (this.writestate.writesEnabled) {
// Remove temporary data left over from old regions
status.setStatus("Cleaning up temporary data from old regions");
fs.cleanupTempDir();
}
if (this.writestate.writesEnabled) {
status.setStatus("Cleaning up detritus from prior splits");
// Get rid of any splits or merges that were lost in-progress. Clean out
// these directories here on open. We may be opening a region that was
// being split but we crashed in the middle of it all.
fs.cleanupAnySplitDetritus();
fs.cleanupMergesDir();
}
// Initialize split policy
this.splitPolicy = RegionSplitPolicy.create(this, conf);
// Initialize flush policy
this.flushPolicy = FlushPolicyFactory.create(this, conf);
long lastFlushTime = EnvironmentEdgeManager.currentTime();
for (HStore store: stores.values()) {
this.lastStoreFlushTimeMap.put(store, lastFlushTime);
}
// Use maximum of log sequenceid or that which was found in stores
// (particularly if no recovered edits, seqid will be -1).
long nextSeqId = maxSeqId + 1;
if (!isRestoredRegion) {
// always get openSeqNum from the default replica, even if we are secondary replicas
long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf,
RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem,
this::getWalFileSystem);
nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1;
// The openSeqNum will always increase even for a read only region, as we rely on it to
// determine whether a region has been successfully reopened, so here we always need to update
// the max sequence id file.
if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName());
WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(),
nextSeqId - 1);
// This means we have replayed all the recovered edits and also written out the max sequence
// id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617
// for more details.
Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf,
getRegionInfo().getTable(), getRegionInfo().getEncodedName());
FileSystem walFs = getWalFileSystem();
if (walFs.exists(wrongRegionWALDir)) {
if (!walFs.delete(wrongRegionWALDir, true)) {
LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir);
}
}
}
}
LOG.info("Opened {}; next sequenceid={}", this.getRegionInfo().getShortNameToLog(), nextSeqId);
// A region can be reopened if it failed a split; reset flags
this.closing.set(false);
this.closed.set(false);
if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-open hooks");
coprocessorHost.postOpen();
}
status.markComplete("Region opened successfully");
return nextSeqId;
}
/**
* Open all Stores.
* @param reporter
* @param status
* @return Highest sequenceId found out in a Store.
* @throws IOException
*/
private long initializeStores(CancelableProgressable reporter, MonitoredTask status)
throws IOException {
return initializeStores(reporter, status, false);
}
private long initializeStores(CancelableProgressable reporter, MonitoredTask status,
boolean warmup) throws IOException {
// Load in all the HStores.
long maxSeqId = -1;
// initialized to -1 so that we pick up MemstoreTS from column families
long maxMemstoreTS = -1;
if (htableDescriptor.getColumnFamilyCount() != 0) {
// initialize the thread pool for opening stores in parallel.
ThreadPoolExecutor storeOpenerThreadPool =
getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
CompletionService<HStore> completionService = new ExecutorCompletionService<>(storeOpenerThreadPool);
// initialize each store in parallel
for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) {
status.setStatus("Instantiating store for column family " + family);
completionService.submit(new Callable<HStore>() {
@Override
public HStore call() throws IOException {
return instantiateHStore(family, warmup);
}
});
}
boolean allStoresOpened = false;
boolean hasSloppyStores = false;
try {
for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) {
Future<HStore> future = completionService.take();
HStore store = future.get();
this.stores.put(store.getColumnFamilyDescriptor().getName(), store);
if (store.isSloppyMemStore()) {
hasSloppyStores = true;
}
long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L);
maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()),
storeMaxSequenceId);
if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
maxSeqId = storeMaxSequenceId;
}
long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L);
if (maxStoreMemstoreTS > maxMemstoreTS) {
maxMemstoreTS = maxStoreMemstoreTS;
}
}
allStoresOpened = true;
if (hasSloppyStores) {
htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor)
.setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName())
.build();
LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this);
}
} catch (InterruptedException e) {
throw (InterruptedIOException)new InterruptedIOException().initCause(e);
} catch (ExecutionException e) {
throw new IOException(e.getCause());
} finally {
storeOpenerThreadPool.shutdownNow();
if (!allStoresOpened) {
// something went wrong, close all opened stores
LOG.error("Could not initialize all stores for the region=" + this);
for (HStore store : this.stores.values()) {
try {
store.close();
} catch (IOException e) {
LOG.warn("close store {} failed in region {}", store.toString(), this, e);
}
}
}
}
}
return Math.max(maxSeqId, maxMemstoreTS + 1);
}
private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
// Initialize all the HStores
status.setStatus("Warming up all the Stores");
try {
initializeStores(reporter, status, true);
} finally {
status.markComplete("Done warming up.");
}
}
/**
* @return Map of StoreFiles by column family
*/
private NavigableMap<byte[], List<Path>> getStoreFiles() {
NavigableMap<byte[], List<Path>> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
for (HStore store : stores.values()) {
Collection<HStoreFile> storeFiles = store.getStorefiles();
if (storeFiles == null) {
continue;
}
List<Path> storeFileNames = new ArrayList<>();
for (HStoreFile storeFile : storeFiles) {
storeFileNames.add(storeFile.getPath());
}
allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames);
}
return allStoreFiles;
}
@VisibleForTesting
protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
Map<byte[], List<Path>> storeFiles = getStoreFiles();
RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
getRegionServerServices().getServerName(), storeFiles);
WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc,
mvcc);
}
private void writeRegionCloseMarker(WAL wal) throws IOException {
Map<byte[], List<Path>> storeFiles = getStoreFiles();
RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(),
getRegionServerServices().getServerName(), storeFiles);
WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc,
mvcc);
// Store SeqId in WAL FileSystem when a region closes
// checking that the region folder exists guards against the many tests that delete the table
// folder while a table is still online
if (getWalFileSystem().exists(getWALRegionDir())) {
WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(),
mvcc.getReadPoint());
}
}
/**
* @return True if this region has references.
*/
public boolean hasReferences() {
return stores.values().stream().anyMatch(HStore::hasReferences);
}
public void blockUpdates() {
this.updatesLock.writeLock().lock();
}
public void unblockUpdates() {
this.updatesLock.writeLock().unlock();
}
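/*
* Editor's note: an illustrative sketch (hypothetical caller). blockUpdates/unblockUpdates take
* and release the updates write lock, so they are expected to be paired in try/finally:
*
*   region.blockUpdates();
*   try {
*     // work that must not run concurrently with writes to this region
*   } finally {
*     region.unblockUpdates();
*   }
*/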
public HDFSBlocksDistribution getHDFSBlocksDistribution() {
HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
stores.values().stream().filter(s -> s.getStorefiles() != null)
.flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution)
.forEachOrdered(hdfsBlocksDistribution::add);
return hdfsBlocksDistribution;
}
/**
* This is a helper function to compute HDFS block distribution on demand
* @param conf configuration
* @param tableDescriptor TableDescriptor of the table
* @param regionInfo the RegionInfo of the region
* @return The HDFS blocks distribution for the given region.
*/
public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf,
TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException {
Path tablePath =
CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName());
return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
}
/**
* This is a helper function to compute HDFS block distribution on demand
* @param conf configuration
* @param tableDescriptor TableDescriptor of the table
* @param regionInfo the RegionInfo of the region
* @param tablePath the table directory
* @return The HDFS blocks distribution for the given region.
* @throws IOException
*/
public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf,
TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException {
HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
FileSystem fs = tablePath.getFileSystem(conf);
HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) {
List<LocatedFileStatus> locatedFileStatusList = HRegionFileSystem
.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true);
if (locatedFileStatusList == null) {
continue;
}
for (LocatedFileStatus status : locatedFileStatusList) {
Path p = status.getPath();
if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) {
// Only construct a StoreFileInfo object if it's not a plain hfile, to save object
// creation
StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status);
hdfsBlocksDistribution.add(storeFileInfo
.computeHDFSBlocksDistribution(fs));
} else if (StoreFileInfo.isHFile(p)) {
// If it's an HFile, then let's just add it to the block distribution;
// let's not create more objects here, not even another HDFSBlocksDistribution
FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution,
status.getBlockLocations());
} else {
throw new IOException("path=" + p
+ " doesn't look like a valid StoreFile");
}
}
}
return hdfsBlocksDistribution;
}
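/*
* Editor's note: an illustrative sketch of a hypothetical caller (e.g. tooling that reports
* locality without opening the region):
*
*   Configuration conf = HBaseConfiguration.create();
*   HDFSBlocksDistribution dist =
*       HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
*   float locality = dist.getBlockLocalityIndex(serverHostname);
*/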
/**
* Increase the size of mem store in this region and the size of global mem
* store
*/
void incMemStoreSize(MemStoreSize mss) {
incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(),
mss.getCellsCount());
}
void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta,
int cellsCountDelta) {
if (this.rsAccounting != null) {
rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta);
}
long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta,
offHeapSizeDelta, cellsCountDelta);
checkNegativeMemStoreDataSize(dataSize, dataSizeDelta);
}
void decrMemStoreSize(MemStoreSize mss) {
decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(),
mss.getCellsCount());
}
void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta,
int cellsCountDelta) {
if (this.rsAccounting != null) {
rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta);
}
long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta,
offHeapSizeDelta, cellsCountDelta);
checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta);
}
private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) {
// This is extremely bad if we make memStoreSizing negative. Log as much info on the offending
// caller as possible. (memStoreSizing might be a negative value already -- freeing memory)
if (memStoreDataSize < 0) {
LOG.error("Asked to modify this region's (" + this.toString()
+ ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing="
+ (memStoreDataSize - delta) + ", delta=" + delta, new Exception());
}
}
@Override
public RegionInfo getRegionInfo() {
return this.fs.getRegionInfo();
}
/**
* @return Instance of {@link RegionServerServices} used by this HRegion.
* Can be null.
*/
RegionServerServices getRegionServerServices() {
return this.rsServices;
}
@Override
public long getReadRequestsCount() {
return readRequestsCount.sum();
}
@Override
public long getFilteredReadRequestsCount() {
return filteredReadRequestsCount.sum();
}
@Override
public long getWriteRequestsCount() {
return writeRequestsCount.sum();
}
@Override
public long getMemStoreDataSize() {
return memStoreSizing.getDataSize();
}
@Override
public long getMemStoreHeapSize() {
return memStoreSizing.getHeapSize();
}
@Override
public long getMemStoreOffHeapSize() {
return memStoreSizing.getOffHeapSize();
}
/** @return store services for this region, to access services required by store level needs */
public RegionServicesForStores getRegionServicesForStores() {
return regionServicesForStores;
}
@Override
public long getNumMutationsWithoutWAL() {
return numMutationsWithoutWAL.sum();
}
@Override
public long getDataInMemoryWithoutWAL() {
return dataInMemoryWithoutWAL.sum();
}
@Override
public long getBlockedRequestsCount() {
return blockedRequestsCount.sum();
}
@Override
public long getCheckAndMutateChecksPassed() {
return checkAndMutateChecksPassed.sum();
}
@Override
public long getCheckAndMutateChecksFailed() {
return checkAndMutateChecksFailed.sum();
}
// TODO Needs to check whether we should expose our metrics system to CPs. If CPs themselves are
// doing the op and bypassing the core, this might be needed? Should we stop supporting the bypass
// feature?
public MetricsRegion getMetrics() {
return metricsRegion;
}
@Override
public boolean isClosed() {
return this.closed.get();
}
@Override
public boolean isClosing() {
return this.closing.get();
}
@Override
public boolean isReadOnly() {
return this.writestate.isReadOnly();
}
@Override
public boolean isAvailable() {
return !isClosed() && !isClosing();
}
@Override
public boolean isSplittable() {
return isAvailable() && !hasReferences();
}
@Override
public boolean isMergeable() {
if (!isAvailable()) {
LOG.debug("Region " + this
+ " is not mergeable because it is closing or closed");
return false;
}
if (hasReferences()) {
LOG.debug("Region " + this
+ " is not mergeable because it has references");
return false;
}
return true;
}
public boolean areWritesEnabled() {
synchronized(this.writestate) {
return this.writestate.writesEnabled;
}
}
@VisibleForTesting
public MultiVersionConcurrencyControl getMVCC() {
return mvcc;
}
@Override
public long getMaxFlushedSeqId() {
return maxFlushedSeqId;
}
/**
* @return readpoint considering given IsolationLevel. Pass {@code null} for default
*/
public long getReadPoint(IsolationLevel isolationLevel) {
if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
// This scan can read even uncommitted transactions
return Long.MAX_VALUE;
}
return mvcc.getReadPoint();
}
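/*
* Editor's note: an illustrative sketch (hypothetical caller). A READ_UNCOMMITTED scan bypasses
* MVCC and therefore gets Long.MAX_VALUE as its read point:
*
*   Scan scan = new Scan().setIsolationLevel(IsolationLevel.READ_UNCOMMITTED);
*   long readPoint = region.getReadPoint(scan.getIsolationLevel());  // == Long.MAX_VALUE
*/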
public boolean isLoadingCfsOnDemandDefault() {
return this.isLoadingCfsOnDemandDefault;
}
/**
* Close down this HRegion. Flush the cache, shut down each HStore, don't
* service any more calls.
*
* This method could take some time to execute, so don't call it from a
* time-sensitive thread.
*
* @return Map, keyed by column family, of all the storage files that the HRegion's component
* HStores make use of. Returns an empty map if already closed and null if judged that it
* should not close.
*
* @throws IOException e
* @throws DroppedSnapshotException Thrown when replay of wal is required
* because a Snapshot was not properly persisted. The region is put in closing mode, and the
* caller MUST abort after this.
*/
public Map<byte[], List<HStoreFile>> close() throws IOException {
return close(false);
}
private final Object closeLock = new Object();
/** Conf key for fair locking policy */
public static final String FAIR_REENTRANT_CLOSE_LOCK =
"hbase.regionserver.fair.region.close.lock";
public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true;
/** Conf key for the periodic flush interval */
public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
"hbase.regionserver.optionalcacheflushinterval";
/** Default interval for the memstore flush */
public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
/** Default interval for System tables memstore flush */
public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
/** Conf key to force a flush if there are already enough changes for one region in memstore */
public static final String MEMSTORE_FLUSH_PER_CHANGES =
"hbase.regionserver.flush.per.changes";
public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
/**
* The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
* overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region
*/
public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
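// Editor's note: hedged configuration sketch (not part of the original source) showing how a
// deployment might tune the flush-per-changes threshold within the bounds above; the value is
// read from the Configuration under MEMSTORE_FLUSH_PER_CHANGES.
//
//   Configuration conf = HBaseConfiguration.create();
//   conf.setLong(HRegion.MEMSTORE_FLUSH_PER_CHANGES, 10_000_000L); // must stay <= MAX_FLUSH_PER_CHANGES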
/**
* Close down this HRegion. Flush the cache unless abort parameter is true,
* Shut down each HStore, don't service any more calls.
*
* This method could take some time to execute, so don't call it from a
* time-sensitive thread.
*
* @param abort true if server is aborting (only during testing)
* @return Map of StoreFiles, keyed by column family, that the HRegion's component
* HStores make use of. Can be null if we are not to close at this time or we are
* already closed.
*
* @throws IOException e
* @throws DroppedSnapshotException Thrown when replay of wal is required
* because a Snapshot was not properly persisted. The region is put in closing mode, and the
* caller MUST abort after this.
*/
public Map<byte[], List<HStoreFile>> close(boolean abort) throws IOException {
// Only allow one thread to close at a time. Serialize them so dual
// threads attempting to close will run up against each other.
MonitoredTask status = TaskMonitor.get().createStatus(
"Closing region " + this.getRegionInfo().getEncodedName() +
(abort ? " due to abort" : ""));
status.enableStatusJournal(true);
status.setStatus("Waiting for close lock");
try {
synchronized (closeLock) {
return doClose(abort, status);
}
} finally {
if (LOG.isDebugEnabled()) {
LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
}
status.cleanup();
}
}
/**
* Exposed for some very specific unit tests.
*/
@VisibleForTesting
public void setClosing(boolean closing) {
this.closing.set(closing);
}
/**
* {@link HRegion#doClose} will block forever if a unit test tries to provoke the deadlock.
* Instead of blocking, {@link HRegion#doClose} will throw an exception if you set the timeout.
* @param timeoutForWriteLock the time, in seconds, to wait for the write lock in {@link HRegion#doClose}
*/
@VisibleForTesting
public void setTimeoutForWriteLock(long timeoutForWriteLock) {
assert timeoutForWriteLock >= 0;
this.timeoutForWriteLock = timeoutForWriteLock;
}
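// Editor's note: test-only usage sketch (not part of the original source). A unit test that
// wants doClose() to fail fast instead of blocking on the region write lock could do:
//
//   region.setTimeoutForWriteLock(5); // wait at most 5 seconds for the write lock in doClose()
//   region.close();                   // throws IOException if the lock is not acquired in time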
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK_EXCEPTION_PATH",
justification="I think FindBugs is confused")
private Map<byte[], List<HStoreFile>> doClose(boolean abort, MonitoredTask status)
throws IOException {
if (isClosed()) {
LOG.warn("Region " + this + " already closed");
return null;
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-close hooks");
this.coprocessorHost.preClose(abort);
}
status.setStatus("Disabling compacts and flushes for region");
boolean canFlush = true;
synchronized (writestate) {
// Disable compacting and flushing by background threads for this
// region.
canFlush = !writestate.readOnly;
writestate.writesEnabled = false;
LOG.debug("Closing {}, disabling compactions & flushes",
this.getRegionInfo().getEncodedName());
waitForFlushesAndCompactions();
}
// If we were not just flushing, is it worth doing a preflush...one
// that will clear out the bulk of the memstore before we put up
// the close flag?
if (!abort && worthPreFlushing() && canFlush) {
status.setStatus("Pre-flushing region before close");
LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName());
try {
internalFlushcache(status);
} catch (IOException ioe) {
// Failed to flush the region. Keep going.
status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
}
}
if (timeoutForWriteLock == null
|| timeoutForWriteLock == Long.MAX_VALUE) {
// block waiting for the lock for closing
lock.writeLock().lock(); // FindBugs: Complains UL_UNRELEASED_LOCK_EXCEPTION_PATH but seems fine
} else {
try {
boolean succeed = lock.writeLock().tryLock(timeoutForWriteLock, TimeUnit.SECONDS);
if (!succeed) {
throw new IOException("Failed to get write lock when closing region");
}
} catch (InterruptedException e) {
throw (InterruptedIOException) new InterruptedIOException().initCause(e);
}
}
this.closing.set(true);
LOG.info("Closing region {}", this);
status.setStatus("Disabling writes for close");
try {
if (this.isClosed()) {
status.abort("Already got closed by another process");
// SplitTransaction handles the null
return null;
}
LOG.debug("Updates disabled for region " + this);
// Don't flush the cache if we are aborting
if (!abort && canFlush) {
int failedFlushCount = 0;
int flushCount = 0;
long tmp = 0;
long remainingSize = this.memStoreSizing.getDataSize();
while (remainingSize > 0) {
try {
internalFlushcache(status);
if (flushCount > 0) {
LOG.info("Running extra flush, " + flushCount +
" (carrying snapshot?) " + this);
}
flushCount++;
tmp = this.memStoreSizing.getDataSize();
if (tmp >= remainingSize) {
failedFlushCount++;
}
remainingSize = tmp;
if (failedFlushCount > 5) {
// If we failed 5 times and are unable to clear memory, abort
// so we do not lose data
throw new DroppedSnapshotException("Failed clearing memory after " +
flushCount + " attempts on region: " +
Bytes.toStringBinary(getRegionInfo().getRegionName()));
}
} catch (IOException ioe) {
status.setStatus("Failed flush " + this + ", putting online again");
synchronized (writestate) {
writestate.writesEnabled = true;
}
// Have to throw to upper layers. I can't abort server from here.
throw ioe;
}
}
}
Map<byte[], List<HStoreFile>> result = new TreeMap<>(Bytes.BYTES_COMPARATOR);
if (!stores.isEmpty()) {
// initialize the thread pool for closing stores in parallel.
ThreadPoolExecutor storeCloserThreadPool =
getStoreOpenAndCloseThreadPool("StoreCloser-" +
getRegionInfo().getRegionNameAsString());
CompletionService<Pair<byte[], Collection<HStoreFile>>> completionService =
new ExecutorCompletionService<>(storeCloserThreadPool);
// close each store in parallel
for (HStore store : stores.values()) {
MemStoreSize mss = store.getFlushableSize();
if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) {
if (getRegionServerServices() != null) {
getRegionServerServices().abort("Assertion failed while closing store "
+ getRegionInfo().getRegionNameAsString() + " " + store
+ ". flushableSize expected=0, actual={" + mss
+ "}. Current memStoreSize=" + this.memStoreSizing.getMemStoreSize() +
". Maybe a coprocessor "
+ "operation failed and left the memstore in a partially updated state.", null);
}
}
completionService
.submit(new Callable<Pair<byte[], Collection<HStoreFile>>>() {
@Override
public Pair<byte[], Collection<HStoreFile>> call() throws IOException {
return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close());
}
});
}
try {
for (int i = 0; i < stores.size(); i++) {
Future<Pair<byte[], Collection<HStoreFile>>> future = completionService.take();
Pair<byte[], Collection<HStoreFile>> storeFiles = future.get();
List<HStoreFile> familyFiles = result.get(storeFiles.getFirst());
if (familyFiles == null) {
familyFiles = new ArrayList<>();
result.put(storeFiles.getFirst(), familyFiles);
}
familyFiles.addAll(storeFiles.getSecond());
}
} catch (InterruptedException e) {
throw (InterruptedIOException)new InterruptedIOException().initCause(e);
} catch (ExecutionException e) {
Throwable cause = e.getCause();
if (cause instanceof IOException) {
throw (IOException) cause;
}
throw new IOException(cause);
} finally {
storeCloserThreadPool.shutdownNow();
}
}
status.setStatus("Writing region close event to WAL");
// Always write close marker to wal even for read only table. This is not a big problem as we
// do not write any data into the region; it is just a meta edit in the WAL file.
if (!abort && wal != null && getRegionServerServices() != null &&
RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
writeRegionCloseMarker(wal);
}
this.closed.set(true);
if (!canFlush) {
decrMemStoreSize(this.memStoreSizing.getMemStoreSize());
} else if (this.memStoreSizing.getDataSize() != 0) {
LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this);
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-close hooks");
this.coprocessorHost.postClose(abort);
}
if (this.metricsRegion != null) {
this.metricsRegion.close();
}
if (this.metricsRegionWrapper != null) {
Closeables.close(this.metricsRegionWrapper, true);
}
status.markComplete("Closed");
LOG.info("Closed {}", this);
return result;
} finally {
lock.writeLock().unlock();
}
}
/** Wait for all current flushes and compactions of the region to complete */
// TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for
// Phoenix needs.
public void waitForFlushesAndCompactions() {
synchronized (writestate) {
if (this.writestate.readOnly) {
// we should not wait for replayed flushes if we are read only (for example in case the
// region is a secondary replica).
return;
}
boolean interrupted = false;
try {
while (writestate.compacting.get() > 0 || writestate.flushing) {
LOG.debug("waiting for " + writestate.compacting + " compactions"
+ (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
try {
writestate.wait();
} catch (InterruptedException iex) {
// essentially ignore and propagate the interrupt back up
LOG.warn("Interrupted while waiting in region {}", this);
interrupted = true;
break;
}
}
} finally {
if (interrupted) {
Thread.currentThread().interrupt();
}
}
}
}
/**
* Wait for all current flushes of the region to complete
*/
public void waitForFlushes() {
waitForFlushes(0);// Unbound wait
}
@Override
public boolean waitForFlushes(long timeout) {
synchronized (writestate) {
if (this.writestate.readOnly) {
// we should not wait for replayed flushes if we are read only (for example in case the
// region is a secondary replica).
return true;
}
if (!writestate.flushing) return true;
long start = System.currentTimeMillis();
long duration = 0;
boolean interrupted = false;
LOG.debug("waiting for cache flush to complete for region " + this);
try {
while (writestate.flushing) {
if (timeout > 0 && duration >= timeout) break;
try {
long toWait = timeout == 0 ? 0 : (timeout - duration);
writestate.wait(toWait);
} catch (InterruptedException iex) {
// essentially ignore and propagate the interrupt back up
LOG.warn("Interrupted while waiting in region {}", this);
interrupted = true;
break;
} finally {
duration = System.currentTimeMillis() - start;
}
}
} finally {
if (interrupted) {
Thread.currentThread().interrupt();
}
}
LOG.debug("Waited {} ms for region {} flush to complete", duration, this);
return !(writestate.flushing);
}
}
@Override
public Configuration getReadOnlyConfiguration() {
return new ReadOnlyConfiguration(this.conf);
}
protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
final String threadNamePrefix) {
int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount());
int maxThreads = Math.min(numStores,
conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
}
protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
final String threadNamePrefix) {
int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount());
int maxThreads = Math.max(1,
conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
/ numStores);
return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
}
static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
final String threadNamePrefix) {
return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
new ThreadFactory() {
private int count = 1;
@Override
public Thread newThread(Runnable r) {
return new Thread(r, threadNamePrefix + "-" + count++);
}
});
}
/**
* @return True if its worth doing a flush before we put up the close flag.
*/
private boolean worthPreFlushing() {
return this.memStoreSizing.getDataSize() >
this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
}
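// Editor's note: hedged configuration sketch (not part of the original source). The pre-close
// flush threshold checked above defaults to 5 MB and can be tuned via the key used in the code:
//
//   conf.setLong("hbase.hregion.preclose.flush.size", 16L * 1024 * 1024); // pre-flush once 16 MB sits in memstore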
//////////////////////////////////////////////////////////////////////////////
// HRegion accessors
//////////////////////////////////////////////////////////////////////////////
@Override
public TableDescriptor getTableDescriptor() {
return this.htableDescriptor;
}
@VisibleForTesting
void setTableDescriptor(TableDescriptor desc) {
htableDescriptor = desc;
}
/** @return WAL in use for this region */
public WAL getWAL() {
return this.wal;
}
public BlockCache getBlockCache() {
return this.blockCache;
}
/**
* Only used for unit test which doesn't start region server.
*/
@VisibleForTesting
public void setBlockCache(BlockCache blockCache) {
this.blockCache = blockCache;
}
public MobFileCache getMobFileCache() {
return this.mobFileCache;
}
/**
* Only used for unit test which doesn't start region server.
*/
@VisibleForTesting
public void setMobFileCache(MobFileCache mobFileCache) {
this.mobFileCache = mobFileCache;
}
/**
* @return split policy for this region.
*/
public RegionSplitPolicy getSplitPolicy() {
return this.splitPolicy;
}
/**
* A split takes the config from the parent region & passes it to the daughter
* region's constructor. If 'conf' was passed, you would end up using the HTD
* of the parent region in addition to the new daughter HTD. Pass 'baseConf'
* to the daughter regions to avoid this tricky dedupe problem.
* @return Configuration object
*/
Configuration getBaseConf() {
return this.baseConf;
}
/** @return {@link FileSystem} being used by this region */
public FileSystem getFilesystem() {
return fs.getFileSystem();
}
/** @return the {@link HRegionFileSystem} used by this region */
public HRegionFileSystem getRegionFileSystem() {
return this.fs;
}
/** @return the WAL {@link HRegionFileSystem} used by this region */
HRegionWALFileSystem getRegionWALFileSystem() throws IOException {
return new HRegionWALFileSystem(conf, getWalFileSystem(),
CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo());
}
/** @return the WAL {@link FileSystem} being used by this region */
FileSystem getWalFileSystem() throws IOException {
if (walFS == null) {
walFS = CommonFSUtils.getWALFileSystem(conf);
}
return walFS;
}
/**
* @return the Region directory under WALRootDirectory
* @throws IOException if there is an error getting WALRootDir
*/
@VisibleForTesting
public Path getWALRegionDir() throws IOException {
if (regionDir == null) {
regionDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(),
getRegionInfo().getEncodedName());
}
return regionDir;
}
@Override
public long getEarliestFlushTimeForAllStores() {
return Collections.min(lastStoreFlushTimeMap.values());
}
@Override
public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
long result = Long.MAX_VALUE;
for (HStore store : stores.values()) {
Collection<HStoreFile> storeFiles = store.getStorefiles();
if (storeFiles == null) {
continue;
}
for (HStoreFile file : storeFiles) {
StoreFileReader sfReader = file.getReader();
if (sfReader == null) {
continue;
}
HFile.Reader reader = sfReader.getHFileReader();
if (reader == null) {
continue;
}
if (majorCompactionOnly) {
byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY);
if (val == null || !Bytes.toBoolean(val)) {
continue;
}
}
result = Math.min(result, reader.getFileContext().getFileCreateTime());
}
}
return result == Long.MAX_VALUE ? 0 : result;
}
RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
regionLoadBldr.clearStoreCompleteSequenceId();
for (byte[] familyName : this.stores.keySet()) {
long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName);
// Subtract 1 to go earlier than the current oldest, unflushed edit in memstore; this will
// give us a sequence id that is for sure flushed. We want edit replay to start after this
// sequence id in this region. If NO_SEQNUM, use the region's maximum flush id.
long csid = (earliest == HConstants.NO_SEQNUM)? lastFlushOpSeqIdLocal: earliest - 1;
regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder()
.setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build());
}
return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
}
//////////////////////////////////////////////////////////////////////////////
// HRegion maintenance.
//
// These methods are meant to be called periodically by the HRegionServer for
// upkeep.
//////////////////////////////////////////////////////////////////////////////
/**
* Do preparation for pending compaction.
* @throws IOException
*/
protected void doRegionCompactionPrep() throws IOException {
}
/**
* Synchronously compact all stores in the region.
* This operation could block for a long time, so don't call it from a
* time-sensitive thread.
*
* Note that no locks are taken to prevent possible conflicts between
* compaction and splitting activities. The regionserver does not normally compact
* and split in parallel. However by calling this method you may introduce
* unexpected and unhandled concurrency. Don't do this unless you know what
* you are doing.
*
* @param majorCompaction True to force a major compaction regardless of thresholds
* @throws IOException
*/
public void compact(boolean majorCompaction) throws IOException {
if (majorCompaction) {
stores.values().forEach(HStore::triggerMajorCompaction);
}
for (HStore s : stores.values()) {
Optional<CompactionContext> compaction = s.requestCompaction();
if (compaction.isPresent()) {
ThroughputController controller = null;
if (rsServices != null) {
controller = CompactionThroughputControllerFactory.create(rsServices, conf);
}
if (controller == null) {
controller = NoLimitThroughputController.INSTANCE;
}
compact(compaction.get(), s, controller, null);
}
}
}
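// Editor's note: illustrative synchronous-compaction sketch (not part of the original source),
// assuming a hypothetical "region" reference; typically only utilities and tests drive this.
//
//   region.compact(true);  // force a major compaction of every store, blocking until done
//   region.compact(false); // only compact stores whose compaction policy requests it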
/**
* This is a helper function that compact all the stores synchronously.
*
* It is used by utilities and testing
*/
@VisibleForTesting
public void compactStores() throws IOException {
for (HStore s : stores.values()) {
Optional<CompactionContext> compaction = s.requestCompaction();
if (compaction.isPresent()) {
compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null);
}
}
}
/**
* This is a helper function that compact the given store.
*
* It is used by utilities and testing
*/
@VisibleForTesting
void compactStore(byte[] family, ThroughputController throughputController) throws IOException {
HStore s = getStore(family);
Optional<CompactionContext> compaction = s.requestCompaction();
if (compaction.isPresent()) {
compact(compaction.get(), s, throughputController, null);
}
}
/**
* Called by compaction thread and after region is opened to compact the
* HStores if necessary.
*
* This operation could block for a long time, so don't call it from a
* time-sensitive thread.
*
* Note that no locking is necessary at this level because compaction only
* conflicts with a region split, and that cannot happen because the region
* server does them sequentially and not in parallel.
*
* @param compaction Compaction details, obtained by requestCompaction()
* @param throughputController
* @return whether the compaction completed
*/
public boolean compact(CompactionContext compaction, HStore store,
ThroughputController throughputController) throws IOException {
return compact(compaction, store, throughputController, null);
}
public boolean compact(CompactionContext compaction, HStore store,
ThroughputController throughputController, User user) throws IOException {
assert compaction != null && compaction.hasSelection();
assert !compaction.getRequest().getFiles().isEmpty();
if (this.closing.get() || this.closed.get()) {
LOG.debug("Skipping compaction on " + this + " because closing/closed");
store.cancelRequestedCompaction(compaction);
return false;
}
MonitoredTask status = null;
boolean requestNeedsCancellation = true;
/*
* We are trying to remove / relax the region read lock for compaction.
* Let's see what are the potential race conditions among the operations (user scan,
* region split, region close and region bulk load).
*
* user scan ---> region read lock
* region split --> region close first --> region write lock
* region close --> region write lock
* region bulk load --> region write lock
*
* read lock is compatible with read lock. ---> no problem with user scan/read
* region bulk load does not cause problem for compaction (no consistency problem, store lock
* will help the store file accounting).
* They can run almost concurrently at the region level.
*
* The only remaining race condition is between the region close and compaction.
* So we will evaluate, below, how region close intervenes with compaction if compaction does
* not acquire region read lock.
*
* Here are the steps for compaction:
* 1. obtain list of StoreFile's
* 2. create StoreFileScanner's based on list from #1
* 3. perform compaction and save resulting files under tmp dir
* 4. swap in compacted files
*
* #1 is guarded by store lock. This patch does not change this --> no worse or better
* For #2, we obtain smallest read point (for region) across all the Scanners (for both default
* compactor and stripe compactor).
* The read points are for user scans. Region keeps the read points for all currently open
* user scanners.
* Compaction needs to know the smallest read point so that during re-write of the hfiles,
* it can remove the mvcc points for the cells if their mvccs are older than the smallest
* since they are not needed anymore.
* This will not conflict with compaction.
* For #3, it can be performed in parallel to other operations.
* For #4 bulk load and compaction don't conflict with each other on the region level
* (for multi-family atomicity).
* Region close and compaction are guarded pretty well by the 'writestate'.
* In HRegion#doClose(), we have :
* synchronized (writestate) {
* // Disable compacting and flushing by background threads for this
* // region.
* canFlush = !writestate.readOnly;
* writestate.writesEnabled = false;
* LOG.debug("Closing " + this + ": disabling compactions & flushes");
* waitForFlushesAndCompactions();
* }
* waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0.
* and in HRegion.compact()
* try {
* synchronized (writestate) {
* if (writestate.writesEnabled) {
* wasStateSet = true;
* ++writestate.compacting;
* } else {
* String msg = "NOT compacting region " + this + ". Writes disabled.";
* LOG.info(msg);
* status.abort(msg);
* return false;
* }
* }
* Also in compactor.performCompaction():
* check periodically to see if a system stop is requested
* if (closeCheckInterval > 0) {
* bytesWritten += len;
* if (bytesWritten > closeCheckInterval) {
* bytesWritten = 0;
* if (!store.areWritesEnabled()) {
* progress.cancel();
* return false;
* }
* }
* }
*/
try {
byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
if (stores.get(cf) != store) {
LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
+ " has been re-instantiated, cancel this compaction request. "
+ " It may be caused by the roll back of split transaction");
return false;
}
status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
status.enableStatusJournal(false);
if (this.closed.get()) {
String msg = "Skipping compaction on " + this + " because closed";
LOG.debug(msg);
status.abort(msg);
return false;
}
boolean wasStateSet = false;
try {
synchronized (writestate) {
if (writestate.writesEnabled) {
wasStateSet = true;
writestate.compacting.incrementAndGet();
} else {
String msg = "NOT compacting region " + this + ". Writes disabled.";
LOG.info(msg);
status.abort(msg);
return false;
}
}
LOG.info("Starting compaction of {} in {}{}", store, this,
(compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
doRegionCompactionPrep();
try {
status.setStatus("Compacting store " + store);
// We no longer need to cancel the request on the way out of this
// method because Store#compact will clean up unconditionally
requestNeedsCancellation = false;
store.compact(compaction, throughputController, user);
} catch (InterruptedIOException iioe) {
String msg = "region " + this + " compaction interrupted";
LOG.info(msg, iioe);
status.abort(msg);
return false;
}
} finally {
if (wasStateSet) {
synchronized (writestate) {
writestate.compacting.decrementAndGet();
if (writestate.compacting.get() <= 0) {
writestate.notifyAll();
}
}
}
}
status.markComplete("Compaction complete");
return true;
} finally {
if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
if (status != null) {
LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
status.cleanup();
}
}
}
/**
* Flush the cache.
*
* When this method is called the cache will be flushed unless:
*
* - the cache is empty
* - the region is closed.
* - a flush is already in progress
* - writes are disabled
*
*
* This method may block for some time, so it should not be called from a
* time-sensitive thread.
* @param force whether we want to force a flush of all stores
* @return FlushResult indicating whether the flush was successful or not and if
* the region needs compacting
*
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of wal is required
* because a snapshot was not properly persisted.
*/
// TODO HBASE-18905. We might have to expose a requestFlush API for CPs
public FlushResult flush(boolean force) throws IOException {
return flushcache(force, false, FlushLifeCycleTracker.DUMMY);
}
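// Editor's note: minimal usage sketch (not part of the original source), assuming a hypothetical
// "region" reference. It shows how a caller can inspect the FlushResult defined below.
//
//   FlushResult fr = region.flush(true); // force-flush all stores
//   if (fr.isFlushSucceeded() && fr.isCompactionNeeded()) {
//     // a follow-up compaction request would normally be scheduled here
//   }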
public interface FlushResult {
enum Result {
FLUSHED_NO_COMPACTION_NEEDED,
FLUSHED_COMPACTION_NEEDED,
// Special case where a flush didn't run because there's nothing in the memstores. Used when
// bulk loading to know when we can still load even if a flush didn't happen.
CANNOT_FLUSH_MEMSTORE_EMPTY,
CANNOT_FLUSH
}
/** @return the detailed result code */
Result getResult();
/** @return true if the memstores were flushed, else false */
boolean isFlushSucceeded();
/** @return True if the flush requested a compaction, else false */
boolean isCompactionNeeded();
}
/**
* Flush the cache.
*
* When this method is called the cache will be flushed unless:
*
* - the cache is empty
* - the region is closed.
* - a flush is already in progress
* - writes are disabled
*
*
* This method may block for some time, so it should not be called from a
* time-sensitive thread.
* @param forceFlushAllStores whether we want to flush all stores
* @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
* @param tracker used to track the life cycle of this flush
* @return whether the flush is success and whether the region needs compacting
*
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of wal is required
* because a Snapshot was not properly persisted. The region is put in closing mode, and the
* caller MUST abort after this.
*/
public FlushResultImpl flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
// fail-fast instead of waiting on the lock
if (this.closing.get()) {
String msg = "Skipping flush on " + this + " because closing";
LOG.debug(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
status.enableStatusJournal(false);
status.setStatus("Acquiring readlock on region");
// block waiting for the lock for flushing cache
lock.readLock().lock();
try {
if (this.closed.get()) {
String msg = "Skipping flush on " + this + " because closed";
LOG.debug(msg);
status.abort(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-flush hooks");
coprocessorHost.preFlush(tracker);
}
// TODO: this should be managed within memstore with the snapshot, updated only after flush
// successful
if (numMutationsWithoutWAL.sum() > 0) {
numMutationsWithoutWAL.reset();
dataInMemoryWithoutWAL.reset();
}
synchronized (writestate) {
if (!writestate.flushing && writestate.writesEnabled) {
this.writestate.flushing = true;
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("NOT flushing memstore for region " + this
+ ", flushing=" + writestate.flushing + ", writesEnabled="
+ writestate.writesEnabled);
}
String msg = "Not flushing since "
+ (writestate.flushing ? "already flushing"
: "writes not enabled");
status.abort(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
}
try {
Collection<HStore> specificStoresToFlush =
forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
FlushResultImpl fs =
internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker);
if (coprocessorHost != null) {
status.setStatus("Running post-flush coprocessor hooks");
coprocessorHost.postFlush(tracker);
}
if (fs.isFlushSucceeded()) {
flushesQueued.reset();
}
status.markComplete("Flush successful " + fs.toString());
return fs;
} finally {
synchronized (writestate) {
writestate.flushing = false;
this.writestate.flushRequested = false;
writestate.notifyAll();
}
}
} finally {
lock.readLock().unlock();
LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
status.cleanup();
}
}
/**
* Should the store be flushed because it is old enough.
*
* Every FlushPolicy should call this to determine whether a store is old enough to flush (except
* that you always flush all stores). Otherwise the method will always
* return true, which will make a lot of flush requests.
*/
boolean shouldFlushStore(HStore store) {
long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(),
store.getColumnFamilyDescriptor().getName()) - 1;
if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " +
getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest +
" is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint());
}
return true;
}
if (this.flushCheckInterval <= 0) {
return false;
}
long now = EnvironmentEdgeManager.currentTime();
if (store.timeOfOldestEdit() < now - this.flushCheckInterval) {
if (LOG.isDebugEnabled()) {
LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " +
getRegionInfo().getEncodedName() + " because time of oldest edit=" +
store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now);
}
return true;
}
return false;
}
/**
* Should the memstore be flushed now
*/
boolean shouldFlush(final StringBuilder whyFlush) {
whyFlush.setLength(0);
// This is a rough measure.
if (this.maxFlushedSeqId > 0
&& (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())) {
whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush");
return true;
}
long modifiedFlushCheckInterval = flushCheckInterval;
if (getRegionInfo().getTable().isSystemTable() &&
getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) {
modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL;
}
if (modifiedFlushCheckInterval <= 0) { //disabled
return false;
}
long now = EnvironmentEdgeManager.currentTime();
//if we flushed in the recent past, we don't need to do it again now
if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
return false;
}
//since we didn't flush in the recent past, flush now if certain conditions
//are met. Return true on first such memstore hit.
for (HStore s : stores.values()) {
if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
// we have an old enough edit in the memstore, flush
whyFlush.append(s.toString() + " has an old edit so flush to free WALs");
return true;
}
}
return false;
}
/**
* Flushing all stores.
* @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker)
*/
private FlushResult internalFlushcache(MonitoredTask status) throws IOException {
return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY);
}
/**
* Flushing given stores.
* @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker)
*/
private FlushResultImpl internalFlushcache(Collection<HStore> storesToFlush, MonitoredTask status,
boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException {
return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status,
writeFlushWalMarker, tracker);
}
/**
* Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the
* memstore, all of which have also been written to the wal. We need to write those updates in the
* memstore out to disk, while being able to process reads/writes as much as possible during the
* flush operation.
*
* This method may block for some time. Every time you call it, we up the region's sequence id even
* if we don't flush; i.e. the returned sequence id will be at least one larger than the last edit
* applied to this region. The returned id does not refer to an actual edit. The returned id can
* be used for say installing a bulk loaded file just ahead of the last hfile that was the result
* of this flush, etc.
* @param wal Null if we're NOT to go via wal.
* @param myseqid The seqid to use if wal is null when writing out the flush file.
* @param storesToFlush The list of stores to flush.
* @return object describing the flush's state
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of WAL is required.
*/
protected FlushResultImpl internalFlushcache(WAL wal, long myseqid,
Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
PrepareFlushResult result =
internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker);
if (result.result == null) {
return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
} else {
return result.result; // early exit due to failure from prepare stage
}
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DLS_DEAD_LOCAL_STORE",
justification="FindBugs seems confused about trxId")
protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid,
Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
if (this.rsServices != null && this.rsServices.isAborted()) {
// Don't flush when server aborting, it's unsafe
throw new IOException("Aborting flush because server is aborted...");
}
final long startTime = EnvironmentEdgeManager.currentTime();
// If nothing to flush, return, but return with a valid unused sequenceId.
// It's needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a
// bulk loaded file between memory and existing hfiles. It wants a good sequenceId that belongs
// to no other that it can use to associate with the bulk load. Hence this little dance below
// to go get one.
if (this.memStoreSizing.getDataSize() <= 0) {
// Take an update lock so no edits can come into memory just yet.
this.updatesLock.writeLock().lock();
WriteEntry writeEntry = null;
try {
if (this.memStoreSizing.getDataSize() <= 0) {
// Presume that if there are still no edits in the memstore, then there are no edits for
// this region out in the WAL subsystem so no need to do any trickery clearing out
// edits in the WAL sub-system. Up the sequence number so the resulting flush id is for
// sure just beyond the last appended region edit and not associated with any edit
// (useful as marker when bulk loading, etc.).
if (wal != null) {
writeEntry = mvcc.begin();
long flushOpSeqId = writeEntry.getWriteNumber();
FlushResultImpl flushResult =
new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId,
"Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
mvcc.completeAndWait(writeEntry);
// Set to null so we don't complete it again down in finally block.
writeEntry = null;
return new PrepareFlushResult(flushResult, myseqid);
} else {
return new PrepareFlushResult(new FlushResultImpl(
FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid);
}
}
} finally {
if (writeEntry != null) {
// If writeEntry is non-null, this operation failed; the mvcc transaction failed...
// but complete it anyways so it doesn't block the mvcc queue.
mvcc.complete(writeEntry);
}
this.updatesLock.writeLock().unlock();
}
}
logFatLineOnFlush(storesToFlush, myseqid);
// Stop updates while we snapshot the memstore of all of these regions' stores. We only have
// to do this for a moment. It is quick. We also set the memstore size to zero here before we
// allow updates again so its value will represent the size of the updates received
// during flush
// We have to take an update lock during snapshot, or else a write could end up in both snapshot
// and memstore (makes it difficult to do atomic rows then)
status.setStatus("Obtaining lock to block concurrent updates");
// block waiting for the lock for internal flush
this.updatesLock.writeLock().lock();
status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName());
MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing();
Map<byte[], Long> flushedFamilyNamesToSeq = new HashMap<>();
for (HStore store : storesToFlush) {
flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(),
store.preFlushSeqIDEstimation());
}
TreeMap<byte[], StoreFlushContext> storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR);
TreeMap<byte[], List<Path>> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
TreeMap<byte[], MemStoreSize> storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR);
// The sequence id of this flush operation which is used to log FlushMarker and pass to
// createFlushContext to use as the store file's sequence id. It can be in advance of edits
// still in the memstore, edits that are in other column families yet to be flushed.
long flushOpSeqId = HConstants.NO_SEQNUM;
// The max flushed sequence id after this flush operation completes. All edits in memstore
// will be in advance of this sequence id.
long flushedSeqId = HConstants.NO_SEQNUM;
byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
try {
if (wal != null) {
Long earliestUnflushedSequenceIdForTheRegion =
wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq);
if (earliestUnflushedSequenceIdForTheRegion == null) {
// This should never happen. This is how startCacheFlush signals flush cannot proceed.
String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
status.setStatus(msg);
return new PrepareFlushResult(
new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
myseqid);
}
flushOpSeqId = getNextSequenceId(wal);
// Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit
flushedSeqId =
earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
} else {
// use the provided sequence Id as WAL is not being used for this flush.
flushedSeqId = flushOpSeqId = myseqid;
}
for (HStore s : storesToFlush) {
storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(),
s.createFlushContext(flushOpSeqId, tracker));
// for writing stores to WAL
committedFiles.put(s.getColumnFamilyDescriptor().getName(), null);
}
// write the snapshot start to WAL
if (wal != null && !writestate.readOnly) {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
// No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
mvcc);
}
// Prepare flush (take a snapshot)
storeFlushCtxs.forEach((name, flush) -> {
MemStoreSize snapshotSize = flush.prepare();
totalSizeOfFlushableStores.incMemStoreSize(snapshotSize);
storeFlushableSize.put(name, snapshotSize);
});
} catch (IOException ex) {
doAbortFlushToWAL(wal, flushOpSeqId, committedFiles);
throw ex;
} finally {
this.updatesLock.writeLock().unlock();
}
String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " +
"flushsize=" + totalSizeOfFlushableStores;
status.setStatus(s);
doSyncOfUnflushedWALChanges(wal, getRegionInfo());
return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores);
}
/**
* Utility method broken out of internalPrepareFlushCache so that method is smaller.
*/
private void logFatLineOnFlush(Collection<HStore> storesToFlush, long sequenceId) {
if (!LOG.isInfoEnabled()) {
return;
}
// Log a fat line detailing what is being flushed.
StringBuilder perCfExtras = null;
if (!isAllFamilies(storesToFlush)) {
perCfExtras = new StringBuilder();
for (HStore store: storesToFlush) {
MemStoreSize mss = store.getFlushableSize();
perCfExtras.append("; ").append(store.getColumnFamilyName());
perCfExtras.append("={dataSize=")
.append(StringUtils.byteDesc(mss.getDataSize()));
perCfExtras.append(", heapSize=")
.append(StringUtils.byteDesc(mss.getHeapSize()));
perCfExtras.append(", offHeapSize=")
.append(StringUtils.byteDesc(mss.getOffHeapSize()));
perCfExtras.append("}");
}
}
MemStoreSize mss = this.memStoreSizing.getMemStoreSize();
LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " +
storesToFlush.size() + "/" + stores.size() + " column families," +
" dataSize=" + StringUtils.byteDesc(mss.getDataSize()) +
" heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) +
((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") +
((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId));
}
private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId,
final Map<byte[], List<Path>> committedFiles) {
if (wal == null) return;
try {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
mvcc);
} catch (Throwable t) {
LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in "
+ " region {}", StringUtils.stringifyException(t), this);
// ignore this since we will be aborting the RS with DSE.
}
// we have called wal.startCacheFlush(), now we have to abort it
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
}
/**
* Sync unflushed WAL changes. See HBASE-8208 for details
*/
private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri)
throws IOException {
if (wal == null) {
return;
}
try {
wal.sync(); // ensure that flush marker is sync'ed
} catch (IOException ioe) {
wal.abortCacheFlush(hri.getEncodedNameAsBytes());
throw ioe;
}
}
/**
* @return True if passed Set is all families in the region.
*/
private boolean isAllFamilies(Collection<HStore> families) {
return families == null || this.stores.size() == families.size();
}
/**
* Writes a marker to WAL indicating a flush is requested but cannot be complete due to various
* reasons. Ignores exceptions from WAL. Returns whether the write succeeded.
* @param wal
* @return whether WAL write was successful
*/
private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR));
try {
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
mvcc);
return true;
} catch (IOException e) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received exception while trying to write the flush request to wal", e);
}
}
return false;
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
justification="Intentional; notify is about completed flush")
protected FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status,
PrepareFlushResult prepareResult, Collection<HStore> storesToFlush) throws IOException {
// prepare flush context is carried via PrepareFlushResult
TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
long startTime = prepareResult.startTime;
long flushOpSeqId = prepareResult.flushOpSeqId;
long flushedSeqId = prepareResult.flushedSeqId;
String s = "Flushing stores of " + this;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
// Any failure from here on out will be catastrophic requiring server
// restart so wal content can be replayed and put back into the memstore.
// Otherwise, while the snapshot content is backed up in the wal, it will not
// be part of the current running server's state.
boolean compactionRequested = false;
long flushedOutputFileSize = 0;
try {
// A. Flush memstore to all the HStores.
// Keep running vector of all store files that includes both old and the
// just-made new flush store file. The new flushed file is still in the
// tmp directory.
for (StoreFlushContext flush : storeFlushCtxs.values()) {
flush.flushCache(status);
}
// Switch snapshot (in memstore) -> new hfile (thus causing
// all the store scanners to reset/reseek).
for (Map.Entry<byte[], StoreFlushContext> flushEntry : storeFlushCtxs.entrySet()) {
StoreFlushContext sfc = flushEntry.getValue();
boolean needsCompaction = sfc.commit(status);
if (needsCompaction) {
compactionRequested = true;
}
byte[] storeName = flushEntry.getKey();
List<Path> storeCommittedFiles = sfc.getCommittedFiles();
committedFiles.put(storeName, storeCommittedFiles);
// Flush committed no files, indicating flush is empty or flush was canceled
if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName);
prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize);
}
flushedOutputFileSize += sfc.getOutputFileSize();
}
storeFlushCtxs.clear();
// Set down the memstore size by amount of flush.
MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize();
this.decrMemStoreSize(mss);
// Increase the size of this Region for the purposes of quota. Noop if quotas are disabled.
// During startup, quota manager may not be initialized yet.
if (rsServices != null) {
RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager();
if (quotaManager != null) {
quotaManager.getRegionSizeStore().incrementRegionSize(
this.getRegionInfo(), flushedOutputFileSize);
}
}
if (wal != null) {
// write flush marker to WAL. If fail, we should throw DroppedSnapshotException
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
mvcc);
}
} catch (Throwable t) {
// An exception here means that the snapshot was not persisted.
// The wal needs to be replayed so its content is restored to memstore.
// Currently, only a server restart will do this.
// We used to only catch IOEs but its possible that we'd get other
// exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
// all and sundry.
if (wal != null) {
try {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc);
} catch (Throwable ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "failed writing ABORT_FLUSH marker to WAL", ex);
// ignore this since we will be aborting the RS with DSE.
}
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
}
DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
Bytes.toStringBinary(getRegionInfo().getRegionName()), t);
status.abort("Flush failed: " + StringUtils.stringifyException(t));
// Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
// However, since we may have the region read lock, we cannot call close(true) here since
// we cannot promote to a write lock. Instead we are setting closing so that all other region
// operations except for close will be rejected.
this.closing.set(true);
if (rsServices != null) {
// This is a safeguard against the case where the caller fails to explicitly handle aborting
rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
}
throw dse;
}
// If we get to here, the HStores have been written.
if (wal != null) {
wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId);
}
// Record latest flush time
for (HStore store: storesToFlush) {
this.lastStoreFlushTimeMap.put(store, startTime);
}
this.maxFlushedSeqId = flushedSeqId;
this.lastFlushOpSeqId = flushOpSeqId;
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
long time = EnvironmentEdgeManager.currentTime() - startTime;
MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize();
long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize();
String msg = "Finished flush of"
+ " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" + mss.getDataSize()
+ ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" + mss.getHeapSize()
+ ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
+ " for " + this.getRegionInfo().getEncodedName() + " in " + time + "ms, sequenceid="
+ flushOpSeqId + ", compaction requested=" + compactionRequested
+ ((wal == null) ? "; wal=null" : "");
LOG.info(msg);
status.setStatus(msg);
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(),
time,
mss.getDataSize(), flushedOutputFileSize);
}
return new FlushResultImpl(compactionRequested ?
FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
}
/**
* Method to safely get the next sequence number.
* @return Next sequence number unassociated with any actual edit.
* @throws IOException
*/
@VisibleForTesting
protected long getNextSequenceId(final WAL wal) throws IOException {
WriteEntry we = mvcc.begin();
mvcc.completeAndWait(we);
return we.getWriteNumber();
}
//////////////////////////////////////////////////////////////////////////////
// get() methods for client use.
//////////////////////////////////////////////////////////////////////////////
@Override
public RegionScannerImpl getScanner(Scan scan) throws IOException {
return getScanner(scan, null);
}
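// Editor's note: illustrative scan loop (not part of the original source), assuming a
// hypothetical "region" reference and family/qualifier byte arrays owned by the caller.
//
//   Scan scan = new Scan().addColumn(family, qualifier);
//   try (RegionScanner scanner = region.getScanner(scan)) {
//     List<Cell> cells = new ArrayList<>();
//     boolean more;
//     do {
//       more = scanner.next(cells); // false once the region is exhausted
//       // process the batched cells here, then reuse the list
//       cells.clear();
//     } while (more);
//   }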
@Override
public RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners)
throws IOException {
return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
private RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners,
long nonceGroup, long nonce) throws IOException {
startRegionOperation(Operation.SCAN);
try {
// Verify families are all valid
if (!scan.hasFamilies()) {
// Adding all families to scanner
for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
scan.addFamily(family);
}
} else {
for (byte[] family : scan.getFamilyMap().keySet()) {
checkFamily(family);
}
}
return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce);
} finally {
closeRegionOperation(Operation.SCAN);
}
}
protected RegionScanner instantiateRegionScanner(Scan scan,
List<KeyValueScanner> additionalScanners) throws IOException {
return instantiateRegionScanner(scan, additionalScanners, HConstants.NO_NONCE,
HConstants.NO_NONCE);
}
protected RegionScannerImpl instantiateRegionScanner(Scan scan,
List<KeyValueScanner> additionalScanners, long nonceGroup, long nonce) throws IOException {
if (scan.isReversed()) {
if (scan.getFilter() != null) {
scan.getFilter().setReversed(true);
}
return new ReversedRegionScannerImpl(scan, additionalScanners, this);
}
return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce);
}
/**
* Prepare a delete for a row mutation processor
* @param delete The passed delete is modified by this method. WARNING!
* @throws IOException
*/
public void prepareDelete(Delete delete) throws IOException {
// Check to see if this is a deleteRow insert
if(delete.getFamilyCellMap().isEmpty()){
for(byte [] family : this.htableDescriptor.getColumnFamilyNames()){
// Don't eat the timestamp
delete.addFamily(family, delete.getTimestamp());
}
} else {
for(byte [] family : delete.getFamilyCellMap().keySet()) {
if(family == null) {
throw new NoSuchColumnFamilyException("Empty family is invalid");
}
checkFamily(family);
}
}
}
@Override
public void delete(Delete delete) throws IOException {
checkReadOnly();
checkResources();
startRegionOperation(Operation.DELETE);
try {
// All edits for the given row (across all column families) must happen atomically.
doBatchMutate(delete);
} finally {
closeRegionOperation(Operation.DELETE);
}
}
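// Editor's note: minimal usage sketch (not part of the original source), assuming hypothetical
// "region", "row" and "family" values. All cells of the row are removed atomically.
//
//   Delete d = new Delete(row);                       // whole-row delete; prepareDelete() expands it to all families
//   region.delete(d);
//   region.delete(new Delete(row).addFamily(family)); // or delete a single family only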
/**
* Row needed by below method.
*/
private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
/**
* This is used only by unit tests. Not required to be a public API.
* @param familyMap map of family to edits for the given family.
* @throws IOException
*/
void delete(NavigableMap<byte[], List<Cell>> familyMap,
Durability durability) throws IOException {
Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
delete.setFamilyCellMap(familyMap);
delete.setDurability(durability);
doBatchMutate(delete);
}
/**
* Set up correct timestamps in the KVs in Delete object.
* Caller should have the row and region locks.
* @param mutation
* @param familyMap
* @param byteNow
* @throws IOException
*/
public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
byte[] byteNow) throws IOException {
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<Cell> cells = e.getValue();
assert cells instanceof RandomAccess;
Map<byte[], Integer> kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR);
int listSize = cells.size();
for (int i=0; i < listSize; i++) {
Cell cell = cells.get(i);
// Check if time is LATEST, change to time of most recent addition if so
// This is expensive.
if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP
&& PrivateCellUtil.isDeleteType(cell)) {
byte[] qual = CellUtil.cloneQualifier(cell);
Integer count = kvCount.get(qual);
if (count == null) {
kvCount.put(qual, 1);
} else {
kvCount.put(qual, count + 1);
}
count = kvCount.get(qual);
Get get = new Get(CellUtil.cloneRow(cell));
get.setMaxVersions(count);
get.addColumn(family, qual);
if (coprocessorHost != null) {
if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
byteNow, get)) {
updateDeleteLatestVersionTimestamp(cell, get, count, byteNow);
}
} else {
updateDeleteLatestVersionTimestamp(cell, get, count, byteNow);
}
} else {
PrivateCellUtil.updateLatestStamp(cell, byteNow);
}
}
}
}
void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow)
throws IOException {
List<Cell> result = get(get, false);
if (result.size() < count) {
// Nothing to delete
PrivateCellUtil.updateLatestStamp(cell, byteNow);
return;
}
if (result.size() > count) {
throw new RuntimeException("Unexpected size: " + result.size());
}
Cell getCell = result.get(count - 1);
PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp());
}
@Override
public void put(Put put) throws IOException {
checkReadOnly();
// Do a rough check that we have resources to accept a write. The check is
// 'rough' in that between the resource check and the call to obtain a
// read lock, resources may run out. For now, the thought is that this
// will be extremely rare; we'll deal with it when it happens.
checkResources();
startRegionOperation(Operation.PUT);
try {
// All edits for the given row (across all column families) must happen atomically.
doBatchMutate(put);
} finally {
closeRegionOperation(Operation.PUT);
}
}
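// Editor's note: minimal usage sketch (not part of the original source), assuming hypothetical
// "region", "row", "family", "qualifier" and "value" byte arrays.
//
//   Put p = new Put(row).addColumn(family, qualifier, value);
//   region.put(p); // all edits for the row are applied atomically via doBatchMutate()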
/**
* Class that tracks the progress of a batch of operations, accumulating status codes and tracking
* the index at which processing is proceeding. These batch operations may get split into
* mini-batches for processing.
*/
private abstract static class BatchOperation<T> {
protected final T[] operations;
protected final OperationStatus[] retCodeDetails;
protected final WALEdit[] walEditsFromCoprocessors;
// reference family cell maps directly so coprocessors can mutate them if desired
protected final Map<byte[], List<Cell>>[] familyCellMaps;
protected final HRegion region;
protected int nextIndexToProcess = 0;
protected final ObservedExceptionsInBatch observedExceptions;
// Durability of the batch (highest durability of all operations)
protected Durability durability;
protected boolean atomic = false;
public BatchOperation(final HRegion region, T[] operations) {
this.operations = operations;
this.retCodeDetails = new OperationStatus[operations.length];
Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
this.walEditsFromCoprocessors = new WALEdit[operations.length];
familyCellMaps = new Map[operations.length];
this.region = region;
observedExceptions = new ObservedExceptionsInBatch();
durability = Durability.USE_DEFAULT;
}
/**
* Visitor interface for batch operations
*/
@FunctionalInterface
public interface Visitor {
/**
* @param index operation index
* @return If true continue visiting remaining entries, break otherwise
*/
boolean visit(int index) throws IOException;
}
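// Note (editor's sketch): since Visitor is a functional interface, callers below supply it as a
// lambda, e.g.
//   visitBatchOperations(true, size(), index -> { /* inspect operation at index */ return true; });
// Returning false from visit() stops the iteration early.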
/**
* Helper method for visiting pending/ all batch operations
*/
public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor)
throws IOException {
assert lastIndexExclusive <= this.size();
for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) {
if (!pendingOnly || isOperationPending(i)) {
if (!visitor.visit(i)) {
break;
}
}
}
}
public abstract Mutation getMutation(int index);
public abstract long getNonceGroup(int index);
public abstract long getNonce(int index);
/**
* This method is potentially expensive and useful mostly for non-replay CP path.
*/
public abstract Mutation[] getMutationsForCoprocs();
public abstract boolean isInReplay();
public abstract long getOrigLogSeqNum();
public abstract void startRegionOperation() throws IOException;
public abstract void closeRegionOperation() throws IOException;
/**
* Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs
* CP prePut()/ preDelete() hooks for all mutations in a batch. This is intended to operate on
* the entire batch and will be called from outside of this class to check and prepare the
* batch. This can be implemented by calling the helper method
* {@link #checkAndPrepareMutation(int, long)} in a 'for' loop over mutations.
*/
public abstract void checkAndPrepare() throws IOException;
/**
* Implement any Put request specific check and prepare logic here. Please refer to
* {@link #checkAndPrepareMutation(Mutation, long)} for how it is used.
*/
protected abstract void checkAndPreparePut(final Put p) throws IOException;
/**
* If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell
* count, tags and timestamp for all cells of all operations in a mini-batch.
*/
public abstract void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation>
miniBatchOp, long timestamp, final List<RowLock> acquiredRowLocks) throws IOException;
/**
* Write mini-batch operations to MemStore
*/
public abstract WriteEntry writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException;
protected void writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final long writeNumber)
throws IOException {
MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing();
visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
// We need to update the sequence id for following reasons.
// 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id.
// 2) If no WAL, FSWALEntry won't be used
// we use durability of the original mutation for the mutation passed by CP.
if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) {
region.updateSequenceId(familyCellMaps[index].values(), writeNumber);
}
applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting);
return true;
});
// update memStore size
region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(),
memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount());
}
public boolean isDone() {
return nextIndexToProcess == operations.length;
}
public int size() {
return operations.length;
}
public boolean isOperationPending(int index) {
return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN;
}
public List<UUID> getClusterIds() {
assert size() != 0;
return getMutation(0).getClusterIds();
}
boolean isAtomic() {
return atomic;
}
/**
* Helper method that checks and prepares only one mutation. This can be used to implement
* {@link #checkAndPrepare()} for the entire batch.
* NOTE: As CP prePut()/ preDelete() hooks may modify mutations, this method should be called
* after prePut()/ preDelete() CP hooks are run for the mutation
*/
protected void checkAndPrepareMutation(Mutation mutation, final long timestamp)
throws IOException {
region.checkRow(mutation.getRow(), "batchMutate");
if (mutation instanceof Put) {
// Check the families in the put. If bad, skip this one.
checkAndPreparePut((Put) mutation);
region.checkTimestamps(mutation.getFamilyCellMap(), timestamp);
} else {
region.prepareDelete((Delete) mutation);
}
}
protected void checkAndPrepareMutation(int index, long timestamp) throws IOException {
Mutation mutation = getMutation(index);
try {
this.checkAndPrepareMutation(mutation, timestamp);
// store the family map reference to allow for mutations
familyCellMaps[index] = mutation.getFamilyCellMap();
// store durability for the batch (highest durability of all operations in the batch)
Durability tmpDur = region.getEffectiveDurability(mutation.getDurability());
if (tmpDur.ordinal() > durability.ordinal()) {
durability = tmpDur;
}
} catch (NoSuchColumnFamilyException nscfe) {
final String msg = "No such column family in batch mutation in region " + this;
if (observedExceptions.hasSeenNoSuchFamily()) {
LOG.warn(msg + nscfe.getMessage());
} else {
LOG.warn(msg, nscfe);
observedExceptions.sawNoSuchFamily();
}
retCodeDetails[index] = new OperationStatus(
OperationStatusCode.BAD_FAMILY, nscfe.getMessage());
if (isAtomic()) { // fail, atomic means all or none
throw nscfe;
}
} catch (FailedSanityCheckException fsce) {
final String msg = "Batch Mutation did not pass sanity check in region " + this;
if (observedExceptions.hasSeenFailedSanityCheck()) {
LOG.warn(msg + fsce.getMessage());
} else {
LOG.warn(msg, fsce);
observedExceptions.sawFailedSanityCheck();
}
retCodeDetails[index] = new OperationStatus(
OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
if (isAtomic()) {
throw fsce;
}
} catch (WrongRegionException we) {
final String msg = "Batch mutation had a row that does not belong to this region " + this;
if (observedExceptions.hasSeenWrongRegion()) {
LOG.warn(msg + we.getMessage());
} else {
LOG.warn(msg, we);
observedExceptions.sawWrongRegion();
}
retCodeDetails[index] = new OperationStatus(
OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
if (isAtomic()) {
throw we;
}
}
}
/**
* Creates a mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which
* a row lock can be acquired. All mutations with locked rows are considered to be
* in-progress operations and hence the name {@link MiniBatchOperationInProgress}. A mini-batch
* is a window over {@link BatchOperation} and contains contiguous pending operations.
*
* @param acquiredRowLocks keeps track of rowLocks acquired.
*/
public MiniBatchOperationInProgress<Mutation> lockRowsAndBuildMiniBatch(
List<RowLock> acquiredRowLocks) throws IOException {
int readyToWriteCount = 0;
int lastIndexExclusive = 0;
RowLock prevRowLock = null;
for (; lastIndexExclusive < size(); lastIndexExclusive++) {
// If we have reached the miniBatchSize, stop here and process this mini-batch.
// This only applies to non-atomic batch operations.
if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) {
break;
}
if (!isOperationPending(lastIndexExclusive)) {
continue;
}
// HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting
// RS handlers, covering both MutationBatchOperation and ReplayBatchOperation
// The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't
// pass the isOperationPending check
Map<byte[], List<Cell>> curFamilyCellMap =
getMutation(lastIndexExclusive).getFamilyCellMap();
try {
// start the protector before acquiring the row lock, for performance reasons; it will be
// finished when an exception is encountered
region.storeHotnessProtector.start(curFamilyCellMap);
} catch (RegionTooBusyException rtbe) {
region.storeHotnessProtector.finish(curFamilyCellMap);
if (isAtomic()) {
throw rtbe;
}
retCodeDetails[lastIndexExclusive] =
new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage());
continue;
}
Mutation mutation = getMutation(lastIndexExclusive);
// If we haven't got any rows in our batch, we should block to get the next one.
RowLock rowLock = null;
boolean throwException = false;
try {
// if atomic then get exclusive lock, else shared lock
rowLock = region.getRowLockInternal(mutation.getRow(), !isAtomic(), prevRowLock);
} catch (TimeoutIOException | InterruptedIOException e) {
// NOTE: We will retry when other exceptions, but we should stop if we receive
// TimeoutIOException or InterruptedIOException as operation has timed out or
// interrupted respectively.
throwException = true;
throw e;
} catch (IOException ioe) {
LOG.warn("Failed getting lock, row={}, in region {}",
Bytes.toStringBinary(mutation.getRow()), this, ioe);
if (isAtomic()) { // fail, atomic means all or none
throwException = true;
throw ioe;
}
} catch (Throwable throwable) {
throwException = true;
throw throwable;
} finally {
if (throwException) {
region.storeHotnessProtector.finish(curFamilyCellMap);
}
}
if (rowLock == null) {
// We failed to grab another lock
if (isAtomic()) {
region.storeHotnessProtector.finish(curFamilyCellMap);
throw new IOException("Can't apply all operations atomically!");
}
break; // Stop acquiring more rows for this batch
} else {
if (rowLock != prevRowLock) {
// It is a different row now, add this to the acquiredRowLocks and
// set prevRowLock to the new returned rowLock
acquiredRowLocks.add(rowLock);
prevRowLock = rowLock;
}
}
readyToWriteCount++;
}
return createMiniBatch(lastIndexExclusive, readyToWriteCount);
}
protected MiniBatchOperationInProgress<Mutation> createMiniBatch(final int lastIndexExclusive,
final int readyToWriteCount) {
return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails,
walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount);
}
/**
* Builds a separate WALEdit per nonce by applying input mutations. If WALEdits from CP are
* present, they are merged into the resulting WALEdit.
*/
public List<Pair<NonceKey, WALEdit>> buildWALEdits(
final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException {
List<Pair<NonceKey, WALEdit>> walEdits = new ArrayList<>();
visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() {
private Pair<NonceKey, WALEdit> curWALEditForNonce;
@Override
public boolean visit(int index) throws IOException {
Mutation m = getMutation(index);
// we use durability of the original mutation for the mutation passed by CP.
if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) {
region.recordMutationWithoutWal(m.getFamilyCellMap());
return true;
}
// the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each.
// Given how nonce keys are originally written, these should be contiguous.
// They don't have to be, it will still work, just write more WALEdits than needed.
long nonceGroup = getNonceGroup(index);
long nonce = getNonce(index);
if (curWALEditForNonce == null ||
curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup ||
curWALEditForNonce.getFirst().getNonce() != nonce) {
curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce),
new WALEdit(miniBatchOp.getCellCount(), isInReplay()));
walEdits.add(curWALEditForNonce);
}
WALEdit walEdit = curWALEditForNonce.getSecond();
// Add WAL edits from CPs.
WALEdit fromCP = walEditsFromCoprocessors[index];
if (fromCP != null) {
for (Cell cell : fromCP.getCells()) {
walEdit.add(cell);
}
}
walEdit.add(familyCellMaps[index]);
return true;
}
});
return walEdits;
}
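// Note (editor's): each entry in the returned list pairs a NonceKey with the WALEdit that holds
// all cells written under that nonce. In the common non-replay case every mutation shares a
// single nonce key, so the list has exactly one element; MutationBatchOperation#buildWALEdits
// below enforces that invariant.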
/**
* This method completes mini-batch operations by calling postBatchMutate() CP hook (if
* required) and completing mvcc.
*/
public void completeMiniBatchOperations(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
if (writeEntry != null) {
region.mvcc.completeAndWait(writeEntry);
}
}
public void doPostOpCleanupForMiniBatch(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WALEdit walEdit,
boolean success) throws IOException {
doFinishHotnessProtector(miniBatchOp);
}
private void doFinishHotnessProtector(
final MiniBatchOperationInProgress<Mutation> miniBatchOp) {
// check and return if the protector is not enabled
if (!region.storeHotnessProtector.isEnable()) {
return;
}
// miniBatchOp is null if and only if lockRowsAndBuildMiniBatch threw an exception.
// That case has already been handled.
if (miniBatchOp == null) {
return;
}
final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive();
for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) {
switch (retCodeDetails[i].getOperationStatusCode()) {
case SUCCESS:
case FAILURE:
region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap());
break;
default:
// do nothing
// We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the
// STORE_TOO_BUSY case is handled in StoreHotnessProtector#start
break;
}
}
}
/**
* Atomically apply the given map of family->edits to the memstore.
* This handles the consistency control on its own, but the caller
* should already have locked updatesLock.readLock(). This also does
* not check the families for validity.
*
* @param familyMap Map of Cells by family
*/
protected void applyFamilyMapToMemStore(Map<byte[], List<Cell>> familyMap,
MemStoreSizing memstoreAccounting) throws IOException {
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<Cell> cells = e.getValue();
assert cells instanceof RandomAccess;
region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting);
}
}
}
/**
* Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most
* of the logic is the same.
*/
static class MutationBatchOperation extends BatchOperation<Mutation> {
private long nonceGroup;
private long nonce;
public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic,
long nonceGroup, long nonce) {
super(region, operations);
this.atomic = atomic;
this.nonceGroup = nonceGroup;
this.nonce = nonce;
}
@Override
public Mutation getMutation(int index) {
return this.operations[index];
}
@Override
public long getNonceGroup(int index) {
return nonceGroup;
}
@Override
public long getNonce(int index) {
return nonce;
}
@Override
public Mutation[] getMutationsForCoprocs() {
return this.operations;
}
@Override
public boolean isInReplay() {
return false;
}
@Override
public long getOrigLogSeqNum() {
return SequenceId.NO_SEQUENCE_ID;
}
@Override
public void startRegionOperation() throws IOException {
region.startRegionOperation(Operation.BATCH_MUTATE);
}
@Override
public void closeRegionOperation() throws IOException {
region.closeRegionOperation(Operation.BATCH_MUTATE);
}
@Override
public void checkAndPreparePut(Put p) throws IOException {
region.checkFamilies(p.getFamilyCellMap().keySet());
}
@Override
public void checkAndPrepare() throws IOException {
final int[] metrics = {0, 0}; // index 0: puts, index 1: deletes
visitBatchOperations(true, this.size(), new Visitor() {
private long now = EnvironmentEdgeManager.currentTime();
private WALEdit walEdit;
@Override
public boolean visit(int index) throws IOException {
// Run coprocessor pre hook outside of locks to avoid deadlock
if (region.coprocessorHost != null) {
if (walEdit == null) {
walEdit = new WALEdit();
}
callPreMutateCPHook(index, walEdit, metrics);
if (!walEdit.isEmpty()) {
walEditsFromCoprocessors[index] = walEdit;
walEdit = null;
}
}
if (isOperationPending(index)) {
// TODO: Currently validation is done with current time before acquiring locks and
// updates are done with different timestamps after acquiring locks. This behavior is
// inherited from the code prior to this change. Can this be changed?
checkAndPrepareMutation(index, now);
}
return true;
}
});
// FIXME: we may update metrics twice! here for all operations bypassed by CP and later in
// normal processing.
// Update metrics in same way as it is done when we go the normal processing route (we now
// update general metrics though a Coprocessor did the work).
if (region.metricsRegion != null) {
if (metrics[0] > 0) {
// There were some Puts in the batch.
region.metricsRegion.updatePut();
}
if (metrics[1] > 0) {
// There were some Deletes in the batch.
region.metricsRegion.updateDelete();
}
}
}
@Override
public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
long timestamp, final List<RowLock> acquiredRowLocks) throws IOException {
byte[] byteTS = Bytes.toBytes(timestamp);
visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
Mutation mutation = getMutation(index);
if (mutation instanceof Put) {
region.updateCellTimestamps(familyCellMaps[index].values(), byteTS);
miniBatchOp.incrementNumOfPuts();
} else {
region.prepareDeleteTimestamps(mutation, familyCellMaps[index], byteTS);
miniBatchOp.incrementNumOfDeletes();
}
region.rewriteCellTags(familyCellMaps[index], mutation);
// update cell count
if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
for (List<Cell> cells : mutation.getFamilyCellMap().values()) {
miniBatchOp.addCellCount(cells.size());
}
}
WALEdit fromCP = walEditsFromCoprocessors[index];
if (fromCP != null) {
miniBatchOp.addCellCount(fromCP.size());
}
return true;
});
if (region.coprocessorHost != null) {
// calling the pre CP hook for batch mutation
region.coprocessorHost.preBatchMutate(miniBatchOp);
checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp);
}
}
@Override
public List<Pair<NonceKey, WALEdit>> buildWALEdits(final MiniBatchOperationInProgress<Mutation>
miniBatchOp) throws IOException {
List> walEdits = super.buildWALEdits(miniBatchOp);
// for MutationBatchOperation, more than one nonce is not allowed
if (walEdits.size() > 1) {
throw new IOException("Found multiple nonce keys per batch!");
}
return walEdits;
}
@Override
public WriteEntry writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, @Nullable WriteEntry writeEntry)
throws IOException {
if (writeEntry == null) {
writeEntry = region.mvcc.begin();
}
super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber());
return writeEntry;
}
@Override
public void completeMiniBatchOperations(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
// TODO: can it be done after completing mvcc?
// calling the post CP hook for batch mutation
if (region.coprocessorHost != null) {
region.coprocessorHost.postBatchMutate(miniBatchOp);
}
super.completeMiniBatchOperations(miniBatchOp, writeEntry);
}
@Override
public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress<Mutation> miniBatchOp,
final WALEdit walEdit, boolean success) throws IOException {
super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success);
if (miniBatchOp != null) {
// synced so that the coprocessor contract is adhered to.
if (region.coprocessorHost != null) {
visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> {
// only for successful puts
if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) {
Mutation m = getMutation(i);
if (m instanceof Put) {
region.coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
} else {
region.coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
}
}
return true;
});
}
// See if the column families were consistent through the whole thing.
// if they were then keep them. If they were not then pass a null.
// null will be treated as unknown.
// Total time taken might be involving Puts and Deletes.
// Split the time for puts and deletes based on the total number of Puts and Deletes.
if (region.metricsRegion != null) {
if (miniBatchOp.getNumOfPuts() > 0) {
// There were some Puts in the batch.
region.metricsRegion.updatePut();
}
if (miniBatchOp.getNumOfDeletes() > 0) {
// There were some Deletes in the batch.
region.metricsRegion.updateDelete();
}
}
}
if (region.coprocessorHost != null) {
// call the coprocessor hook to do any finalization steps after the put is done
region.coprocessorHost.postBatchMutateIndispensably(
miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success);
}
}
/**
* Runs prePut/ preDelete coprocessor hook for input mutation in a batch
* @param metrics Array of 2 ints. index 0: count of puts and index 1: count of deletes
*/
private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics)
throws IOException {
Mutation m = getMutation(index);
if (m instanceof Put) {
if (region.coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
// pre hook says skip this Put
// mark as success and skip in doMiniBatchMutation
metrics[0]++;
retCodeDetails[index] = OperationStatus.SUCCESS;
}
} else if (m instanceof Delete) {
Delete curDel = (Delete) m;
if (curDel.getFamilyCellMap().isEmpty()) {
// handle deleting a row case
// TODO: prepareDelete() has been called twice, before and after preDelete() CP hook.
// Can this be avoided?
region.prepareDelete(curDel);
}
if (region.coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
// pre hook says skip this Delete
// mark as success and skip in doMiniBatchMutation
metrics[1]++;
retCodeDetails[index] = OperationStatus.SUCCESS;
}
} else {
String msg = "Put/Delete mutations only supported in a batch";
// In case of passing Append mutations along with the Puts and Deletes in batchMutate
// mark the operation return code as failure so that it will not be considered in
// the doMiniBatchMutation
retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg);
if (isAtomic()) { // fail, atomic means all or none
throw new IOException(msg);
}
}
}
private void checkAndMergeCPMutations(final MiniBatchOperationInProgress<Mutation> miniBatchOp,
final List<RowLock> acquiredRowLocks, final long timestamp) throws IOException {
visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> {
// we pass (i - firstIndex) below since the call expects a relative index
Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess);
if (cpMutations == null) {
return true;
}
// Else Coprocessor added more Mutations corresponding to the Mutation at this index.
Mutation mutation = getMutation(i);
for (Mutation cpMutation : cpMutations) {
this.checkAndPrepareMutation(cpMutation, timestamp);
// Acquire row locks. If not, the whole batch will fail.
acquiredRowLocks.add(region.getRowLockInternal(cpMutation.getRow(), true, null));
// Returned mutations from coprocessor correspond to the Mutation at index i. We can
// directly add the cells from those mutations to the familyMaps of this mutation.
Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap();
region.rewriteCellTags(cpFamilyMap, mutation);
// will get added to the memStore later
mergeFamilyMaps(familyCellMaps[i], cpFamilyMap);
// The durability of returned mutation is replaced by the corresponding mutation.
// If the corresponding mutation contains the SKIP_WAL, we shouldn't count the
// cells of returned mutation.
if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
for (List<Cell> cells : cpFamilyMap.values()) {
miniBatchOp.addCellCount(cells.size());
}
}
}
return true;
});
}
private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap,
Map<byte[], List<Cell>> toBeMerged) {
for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) {
List<Cell> cells = familyMap.get(entry.getKey());
if (cells == null) {
familyMap.put(entry.getKey(), entry.getValue());
} else {
cells.addAll(entry.getValue());
}
}
}
}
/**
* Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most
* of the logic is the same.
*/
static class ReplayBatchOperation extends BatchOperation<MutationReplay> {
private long origLogSeqNum = 0;
public ReplayBatchOperation(final HRegion region, MutationReplay[] operations,
long origLogSeqNum) {
super(region, operations);
this.origLogSeqNum = origLogSeqNum;
}
@Override
public Mutation getMutation(int index) {
return this.operations[index].mutation;
}
@Override
public long getNonceGroup(int index) {
return this.operations[index].nonceGroup;
}
@Override
public long getNonce(int index) {
return this.operations[index].nonce;
}
@Override
public Mutation[] getMutationsForCoprocs() {
return null;
}
@Override
public boolean isInReplay() {
return true;
}
@Override
public long getOrigLogSeqNum() {
return this.origLogSeqNum;
}
@Override
public void startRegionOperation() throws IOException {
region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE);
}
@Override
public void closeRegionOperation() throws IOException {
region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE);
}
/**
* During replay, there could exist column families which were removed between the region
* server failure and the replay.
*/
@Override
protected void checkAndPreparePut(Put p) throws IOException {
Map<byte[], List<Cell>> familyCellMap = p.getFamilyCellMap();
List<byte[]> nonExistentList = null;
for (byte[] family : familyCellMap.keySet()) {
if (!region.htableDescriptor.hasColumnFamily(family)) {
if (nonExistentList == null) {
nonExistentList = new ArrayList<>();
}
nonExistentList.add(family);
}
}
if (nonExistentList != null) {
for (byte[] family : nonExistentList) {
// Perhaps schema was changed between crash and replay
LOG.info("No family for {} omit from reply in region {}.", Bytes.toString(family), this);
familyCellMap.remove(family);
}
}
}
@Override
public void checkAndPrepare() throws IOException {
long now = EnvironmentEdgeManager.currentTime();
visitBatchOperations(true, this.size(), (int index) -> {
checkAndPrepareMutation(index, now);
return true;
});
}
@Override
public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
long timestamp, final List<RowLock> acquiredRowLocks) throws IOException {
visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
// update cell count
for (List<Cell> cells : getMutation(index).getFamilyCellMap().values()) {
miniBatchOp.addCellCount(cells.size());
}
return true;
});
}
@Override
public WriteEntry writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum());
return writeEntry;
}
@Override
public void completeMiniBatchOperations(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
super.completeMiniBatchOperations(miniBatchOp, writeEntry);
region.mvcc.advanceTo(getOrigLogSeqNum());
}
}
public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
throws IOException {
return batchMutate(mutations, false, nonceGroup, nonce);
}
public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup,
long nonce) throws IOException {
// As it stands, this is used for 3 things
// * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
// * coprocessor calls (see ex. BulkDeleteEndpoint).
// So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce));
}
@Override
public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
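// Usage sketch (editor's illustration; assumes an already-opened HRegion named "region" and
// previously built Put/Delete instances):
//   Mutation[] batch = new Mutation[] { put, delete };
//   OperationStatus[] statuses = region.batchMutate(batch);
//   for (OperationStatus status : statuses) {
//     // inspect status.getOperationStatusCode() per mutation, e.g. SUCCESS or BAD_FAMILY
//   }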
public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
throws IOException {
if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
&& replaySeqId < lastReplayedOpenRegionSeqId) {
// if it is a secondary replica we should ignore these entries silently
// since they are coming out of order
if (LOG.isTraceEnabled()) {
LOG.trace(getRegionInfo().getEncodedName() + " : "
+ "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
+ " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
for (MutationReplay mut : mutations) {
LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
}
}
OperationStatus[] statuses = new OperationStatus[mutations.length];
for (int i = 0; i < statuses.length; i++) {
statuses[i] = OperationStatus.SUCCESS;
}
return statuses;
}
return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId));
}
/**
* Perform a batch of mutations.
*
* It supports only Put and Delete mutations and will ignore other types passed. Operations in
* a batch are stored with the highest durability specified for any operation in the batch,
* except for {@link Durability#SKIP_WAL}.
*
* This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with
* {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[], long, long)} with
* {@link MutationBatchOperation} instance as an argument. As the processing of replay batch
* and mutation batch is very similar, a lot of code is shared by providing generic methods in
* base class {@link BatchOperation}. The logic for this method and
* {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which
* are overridden by derived classes to implement special behavior.
*
* @param batchOp contains the list of mutations
* @return an array of OperationStatus which internally contains the
* OperationStatusCode and the exceptionMessage if any.
* @throws IOException if an IO problem is encountered
*/
OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException {
boolean initialized = false;
batchOp.startRegionOperation();
try {
while (!batchOp.isDone()) {
if (!batchOp.isInReplay()) {
checkReadOnly();
}
checkResources();
if (!initialized) {
this.writeRequestsCount.add(batchOp.size());
// validate and prepare batch for write, for MutationBatchOperation it also calls CP
// prePut()/ preDelete() hooks
batchOp.checkAndPrepare();
initialized = true;
}
doMiniBatchMutate(batchOp);
requestFlushIfNeeded();
}
} finally {
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.
getTableName(), batchOp.size());
}
batchOp.closeRegionOperation();
}
return batchOp.retCodeDetails;
}
/**
* Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[], long, long)}.
* In here we also handle replay of edits on region recovery. Also gets the change in size brought
* about by applying {@code batchOp}.
*/
private void doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException {
boolean success = false;
WALEdit walEdit = null;
WriteEntry writeEntry = null;
boolean locked = false;
// We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive)
MiniBatchOperationInProgress<Mutation> miniBatchOp = null;
/** Keep track of the locks we hold so we can release them in finally clause */
List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size());
try {
// STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with
// locked rows
miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks);
// We've now grabbed as many mutations off the list as we can
// Ensure we acquire at least one.
if (miniBatchOp.getReadyToWriteCount() <= 0) {
// Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
return;
}
lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount());
locked = true;
// STEP 2. Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp
// We should record the timestamp only after we have acquired the rowLock,
// otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
long now = EnvironmentEdgeManager.currentTime();
batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks);
// STEP 3. Build WAL edit
List<Pair<NonceKey, WALEdit>> walEdits = batchOp.buildWALEdits(miniBatchOp);
// STEP 4. Append the WALEdits to WAL and sync.
for (Iterator<Pair<NonceKey, WALEdit>> it = walEdits.iterator(); it.hasNext();) {
Pair<NonceKey, WALEdit> nonceKeyWALEditPair = it.next();
walEdit = nonceKeyWALEditPair.getSecond();
NonceKey nonceKey = nonceKeyWALEditPair.getFirst();
if (walEdit != null && !walEdit.isEmpty()) {
writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now,
nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum());
}
// Complete mvcc for all but last writeEntry (for replay case)
if (it.hasNext() && writeEntry != null) {
mvcc.complete(writeEntry);
writeEntry = null;
}
}
// STEP 5. Write back to memStore
// NOTE: writeEntry can be null here
writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry);
// STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and
// complete mvcc for last writeEntry
batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry);
writeEntry = null;
success = true;
} finally {
// Call complete rather than completeAndWait because we probably had error if walKey != null
if (writeEntry != null) mvcc.complete(writeEntry);
if (locked) {
this.updatesLock.readLock().unlock();
}
releaseRowLocks(acquiredRowLocks);
final int finalLastIndexExclusive =
miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size();
final boolean finalSuccess = success;
batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> {
batchOp.retCodeDetails[i] =
finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE;
return true;
});
batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess);
batchOp.nextIndexToProcess = finalLastIndexExclusive;
}
}
/**
* Returns effective durability from the passed durability and
* the table descriptor.
*/
protected Durability getEffectiveDurability(Durability d) {
return d == Durability.USE_DEFAULT ? this.regionDurability : d;
}
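// Example (editor's note): a Mutation left at Durability.USE_DEFAULT inherits the region's
// configured durability (this.regionDurability); any explicit setting on the Mutation, such as
// SKIP_WAL or FSYNC_WAL, is returned unchanged.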
@Override
public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op,
ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException {
return doCheckAndRowMutate(row, family, qualifier, op, comparator, null, timeRange, null,
mutation);
}
@Override
public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation)
throws IOException {
return doCheckAndRowMutate(row, null, null, null, null, filter, timeRange, null, mutation);
}
@Override
public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op,
ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException {
return doCheckAndRowMutate(row, family, qualifier, op, comparator, null, timeRange, rm, null);
}
@Override
public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm)
throws IOException {
return doCheckAndRowMutate(row, null, null, null, null, filter, timeRange, rm, null);
}
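// Usage sketch (editor's illustration; assumes an open HRegion named "region" and a prepared Put
// named "put"): apply the Put only if cf:q currently equals "old". A null TimeRange and no filter
// means no additional restriction (see doCheckAndRowMutate below).
//   boolean applied = region.checkAndMutate(row, Bytes.toBytes("cf"), Bytes.toBytes("q"),
//       CompareOperator.EQUAL, new BinaryComparator(Bytes.toBytes("old")), null, put);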
/**
* checkAndMutate and checkAndRowMutate are 90% the same. Rather than copy/paste, below has
* switches in the few places where there is deviation.
*/
private boolean doCheckAndRowMutate(byte[] row, byte[] family, byte[] qualifier,
CompareOperator op, ByteArrayComparable comparator, Filter filter, TimeRange timeRange,
RowMutations rowMutations, Mutation mutation)
throws IOException {
// Could do the below checks but seems wacky with two callers only. Just comment out for now.
// One caller passes a Mutation, the other passes RowMutation. Presume all good so we don't
// need these commented out checks.
// if (rowMutations == null && mutation == null) throw new DoNotRetryIOException("Both null");
// if (rowMutations != null && mutation != null) throw new DoNotRetryIOException("Both set");
if (mutation != null) {
checkMutationType(mutation);
checkRow(mutation, row);
} else {
checkRow(rowMutations, row);
}
checkReadOnly();
// TODO, add check for value length also move this check to the client
checkResources();
startRegionOperation();
try {
Get get = new Get(row);
if (family != null) {
checkFamily(family);
get.addColumn(family, qualifier);
}
if (filter != null) {
get.setFilter(filter);
}
if (timeRange != null) {
get.setTimeRange(timeRange.getMin(), timeRange.getMax());
}
// Lock row - note that doBatchMutate will relock this row if called
checkRow(row, "doCheckAndRowMutate");
RowLock rowLock = getRowLockInternal(get.getRow(), false, null);
try {
if (mutation != null && this.getCoprocessorHost() != null) {
// Call coprocessor.
Boolean processed = null;
if (mutation instanceof Put) {
if (filter != null) {
processed = this.getCoprocessorHost()
.preCheckAndPutAfterRowLock(row, filter, (Put) mutation);
} else {
processed = this.getCoprocessorHost()
.preCheckAndPutAfterRowLock(row, family, qualifier, op, comparator,
(Put) mutation);
}
} else if (mutation instanceof Delete) {
if (filter != null) {
processed = this.getCoprocessorHost()
.preCheckAndDeleteAfterRowLock(row, filter, (Delete) mutation);
} else {
processed = this.getCoprocessorHost()
.preCheckAndDeleteAfterRowLock(row, family, qualifier, op, comparator,
(Delete) mutation);
}
}
if (processed != null) {
return processed;
}
}
// NOTE: We used to wait here until mvcc caught up: mvcc.await();
// Supposition is that now all changes are done under row locks, then when we go to read,
// we'll get the latest on this row.
List<Cell> result = get(get, false);
boolean matches = false;
long cellTs = 0;
if (filter != null) {
if (!result.isEmpty()) {
matches = true;
cellTs = result.get(0).getTimestamp();
}
} else {
boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0;
if (result.isEmpty() && valueIsNull) {
matches = true;
} else if (result.size() > 0 && result.get(0).getValueLength() == 0 && valueIsNull) {
matches = true;
cellTs = result.get(0).getTimestamp();
} else if (result.size() == 1 && !valueIsNull) {
Cell kv = result.get(0);
cellTs = kv.getTimestamp();
int compareResult = PrivateCellUtil.compareValue(kv, comparator);
matches = matches(op, compareResult);
}
}
// If matches put the new put or delete the new delete
if (matches) {
// We have acquired the row lock already. If the system clock is NOT monotonically
// non-decreasing (see HBASE-14070) we should make sure that the mutation has a
// larger timestamp than what was observed via Get. doBatchMutate already does this, but
// there is no way to pass the cellTs. See HBASE-14054.
long now = EnvironmentEdgeManager.currentTime();
long ts = Math.max(now, cellTs); // ensure write is not eclipsed
byte[] byteTs = Bytes.toBytes(ts);
if (mutation != null) {
if (mutation instanceof Put) {
updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs);
}
// And else 'delete' is not needed since it already does a second get, and sets the
// timestamp from get (see prepareDeleteTimestamps).
} else {
for (Mutation m: rowMutations.getMutations()) {
if (m instanceof Put) {
updateCellTimestamps(m.getFamilyCellMap().values(), byteTs);
}
}
// And else 'delete' is not needed since it already does a second get, and sets the
// timestamp from get (see prepareDeleteTimestamps).
}
// All edits for the given row (across all column families) must happen atomically.
if (mutation != null) {
doBatchMutate(mutation);
} else {
mutateRow(rowMutations);
}
this.checkAndMutateChecksPassed.increment();
return true;
}
this.checkAndMutateChecksFailed.increment();
return false;
} finally {
rowLock.release();
}
} finally {
closeRegionOperation();
}
}
private void checkMutationType(final Mutation mutation)
throws DoNotRetryIOException {
boolean isPut = mutation instanceof Put;
if (!isPut && !(mutation instanceof Delete)) {
throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must be Put or Delete");
}
}
private void checkRow(final Row action, final byte[] row)
throws DoNotRetryIOException {
if (!Bytes.equals(row, action.getRow())) {
throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match");
}
}
private boolean matches(final CompareOperator op, final int compareResult) {
boolean matches = false;
switch (op) {
case LESS:
matches = compareResult < 0;
break;
case LESS_OR_EQUAL:
matches = compareResult <= 0;
break;
case EQUAL:
matches = compareResult == 0;
break;
case NOT_EQUAL:
matches = compareResult != 0;
break;
case GREATER_OR_EQUAL:
matches = compareResult >= 0;
break;
case GREATER:
matches = compareResult > 0;
break;
default:
throw new RuntimeException("Unknown Compare op " + op.name());
}
return matches;
}
private void doBatchMutate(Mutation mutation) throws IOException {
// Currently this is only called for puts and deletes, so no nonces.
OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation});
if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
} else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
} else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) {
throw new RegionTooBusyException(batchMutate[0].getExceptionMsg());
}
}
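// Note (editor's): this helper re-surfaces the per-operation status of the single-element batch
// as the exception types the blocking single-row paths (put/delete/checkAndMutate) are expected
// to throw: FailedSanityCheckException, NoSuchColumnFamilyException or RegionTooBusyException.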
/**
* Complete taking the snapshot on the region. Writes the region info and adds references to the
* working snapshot directory.
*
* TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
* arg. (In the future other cancellable HRegion methods could eventually add a
* {@link ForeignExceptionSnare}, or we could do something fancier).
*
* @param desc snapshot description object
* @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
* bail out. This is allowed to be null and will just be ignored in that case.
* @throws IOException if there is an external or internal error causing the snapshot to fail
*/
public void addRegionToSnapshot(SnapshotDescription desc,
ForeignExceptionSnare exnSnare) throws IOException {
Path rootDir = CommonFSUtils.getRootDir(conf);
Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf);
SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
snapshotDir, desc, exnSnare);
manifest.addRegion(this);
}
private void updateSequenceId(final Iterable<List<Cell>> cellItr, final long sequenceId)
throws IOException {
for (List<Cell> cells: cellItr) {
if (cells == null) return;
for (Cell cell : cells) {
PrivateCellUtil.setSequenceId(cell, sequenceId);
}
}
}
/**
* Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP}
* with the provided current timestamp.
* @param cellItr
* @param now
*/
private static void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
throws IOException {
for (List<Cell> cells: cellItr) {
if (cells == null) continue;
// Optimization: 'foreach' loop is not used. See:
// HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
assert cells instanceof RandomAccess;
int listSize = cells.size();
for (int i = 0; i < listSize; i++) {
PrivateCellUtil.updateLatestStamp(cells.get(i), now);
}
}
}
/**
* Possibly rewrite incoming cell tags.
*/
void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
// Check if we have any work to do and early out otherwise
// Update these checks as more logic is added here
if (m.getTTL() == Long.MAX_VALUE) {
return;
}
// From this point we know we have some work to do
for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
List<Cell> cells = e.getValue();
assert cells instanceof RandomAccess;
int listSize = cells.size();
for (int i = 0; i < listSize; i++) {
Cell cell = cells.get(i);
List<Tag> newTags = TagUtil.carryForwardTags(null, cell);
newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL());
// Rewrite the cell with the updated set of tags
cells.set(i, PrivateCellUtil.createCell(cell, newTags));
}
}
}
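// Example (editor's note): for a mutation created as
//   Put put = new Put(row).setTTL(60_000L);   // 60 second cell TTL
// every cell in its family map gets a TTL tag carried forward here; mutations that keep the
// default TTL of Long.MAX_VALUE are returned untouched by the early-out above.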
/*
* Check if we have the resources to support an update.
*
* We throw RegionTooBusyException if we are above the memstore limit
* and expect the client to retry using some kind of backoff.
*/
void checkResources() throws RegionTooBusyException {
// If catalog region, do not impose resource constraints or block updates.
if (this.getRegionInfo().isMetaRegion()) return;
MemStoreSize mss = this.memStoreSizing.getMemStoreSize();
if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) {
blockedRequestsCount.increment();
requestFlush();
// Don't print current limit because it will vary too much. The message is used as a key
// over in RetriesExhaustedWithDetailsException processing.
final String regionName =
this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName();
final String serverName = this.getRegionServerServices() == null ?
"unknown" : (this.getRegionServerServices().getServerName() == null ? "unknown" :
this.getRegionServerServices().getServerName().toString());
RegionTooBusyException rtbe = new RegionTooBusyException(
"Over memstore limit=" + org.apache.hadoop.hbase.procedure2.util.StringUtils
.humanSize(this.blockingMemStoreSize) + ", regionName=" + regionName + ", server="
+ serverName);
LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe);
throw rtbe;
}
}
/**
* @throws IOException Throws exception if region is in read-only mode.
*/
protected void checkReadOnly() throws IOException {
if (isReadOnly()) {
throw new DoNotRetryIOException("region is read only");
}
}
protected void checkReadsEnabled() throws IOException {
if (!this.writestate.readsEnabled) {
throw new IOException(getRegionInfo().getEncodedName()
+ ": The region's reads are disabled. Cannot serve the request");
}
}
public void setReadsEnabled(boolean readsEnabled) {
if (readsEnabled && !this.writestate.readsEnabled) {
LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
}
this.writestate.setReadsEnabled(readsEnabled);
}
/**
* Add updates first to the wal and then add values to memstore.
* Warning: Assumption is that the caller has a lock on the passed-in row.
* @param edits Cell updates by column
* @throws IOException
*/
void put(final byte [] row, byte [] family, List<Cell> edits)
throws IOException {
NavigableMap<byte[], List<Cell>> familyMap;
familyMap = new TreeMap<>(Bytes.BYTES_COMPARATOR);
familyMap.put(family, edits);
Put p = new Put(row);
p.setFamilyCellMap(familyMap);
doBatchMutate(p);
}
/**
* @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be
* set; when set we will run operations that make sense in the increment/append scenario
* but that do not make sense otherwise.
* @see #applyToMemStore(HStore, Cell, MemStoreSizing)
*/
private void applyToMemStore(HStore store, List<Cell> cells, boolean delta,
MemStoreSizing memstoreAccounting) throws IOException {
// Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1;
if (upsert) {
store.upsert(cells, getSmallestReadPoint(), memstoreAccounting);
} else {
store.add(cells, memstoreAccounting);
}
}
/**
* @see #applyToMemStore(HStore, List, boolean, MemStoreSizing)
*/
private void applyToMemStore(HStore store, Cell cell, MemStoreSizing memstoreAccounting)
throws IOException {
// Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
if (store == null) {
checkFamily(CellUtil.cloneFamily(cell));
// Unreachable because checkFamily will throw exception
}
store.add(cell, memstoreAccounting);
}
/**
* Check the collection of families for validity.
* @param families
* @throws NoSuchColumnFamilyException
*/
public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
for (byte[] family : families) {
checkFamily(family);
}
}
/**
* Check the collection of families for valid timestamps
* @param familyMap
* @param now current timestamp
* @throws FailedSanityCheckException
*/
public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
throws FailedSanityCheckException {
if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
return;
}
long maxTs = now + timestampSlop;
for (List<Cell> kvs : familyMap.values()) {
// Optimization: 'foreach' loop is not used. See:
// HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
assert kvs instanceof RandomAccess;
int listSize = kvs.size();
for (int i=0; i < listSize; i++) {
Cell cell = kvs.get(i);
// see if the user-side TS is out of range. latest = server-side
long ts = cell.getTimestamp();
if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
throw new FailedSanityCheckException("Timestamp for KV out of range "
+ cell + " (too.new=" + timestampSlop + ")");
}
}
}
}
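// Example (editor's note): with a timestamp slop of 1000 ms, a Put carrying a user-supplied
// timestamp more than one second ahead of the server clock fails the sanity check above; the
// default slop of LATEST_TIMESTAMP disables the check entirely (the early return at the top).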
/*
* @param size
* @return True if size is over the flush threshold
*/
private boolean isFlushSize(MemStoreSize size) {
return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize();
}
private void deleteRecoveredEdits(FileSystem fs, Iterable<Path> files) throws IOException {
for (Path file : files) {
if (!fs.delete(file, false)) {
LOG.error("Failed delete of {}", file);
} else {
LOG.debug("Deleted recovered.edits file={}", file);
}
}
}
/**
* Read the edits put under this region by wal splitting process. Put
* the recovered edits back up into this region.
*
* We can ignore any wal message that has a sequence ID that's equal to or
* lower than minSeqId. (Because we know such messages are already
* reflected in the HFiles.)
*
* While this is running we are putting pressure on memory yet we are
* outside of our usual accounting because we are not yet an onlined region
* (this stuff is being run as part of Region initialization). This means
* that if we're up against global memory limits, we'll not be flagged to flush
* because we are not online. We can't be flushed by usual mechanisms anyways;
* we're not yet online so our relative sequenceids are not yet aligned with
* WAL sequenceids -- not till we come up online, post processing of split
* edits.
*
* But to help relieve memory pressure, at least manage our own heap size
* flushing if we are in excess of per-region limits. Flushing, though, we have
* to be careful and avoid using the regionserver/wal sequenceid. It is running
* on a different track from what is going on here in this region context, so if we
* crashed while replaying these edits, but in the midst had a flush that used the
* regionserver wal with a sequenceid in excess of what this region and its
* split editlogs have seen, then we could miss edits the
* next time we go to recover. So, we have to flush inline, using seqids that
* make sense in this single region context only -- until we come online.
*
* @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
* the maxSeqId for the store to be applied, else it is skipped.
* @return the sequence id of the last edit added to this region out of the
* recovered edits log or minSeqId if nothing added from editlogs.
*/
@VisibleForTesting
long replayRecoveredEditsIfAny(Map<byte[], Long> maxSeqIdInStores,
final CancelableProgressable reporter, final MonitoredTask status) throws IOException {
long minSeqIdForTheRegion = -1;
for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
minSeqIdForTheRegion = maxSeqIdInStore;
}
}
long seqId = minSeqIdForTheRegion;
String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR);
if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) {
FileSystem walFS = getWalFileSystem();
FileSystem rootFS = getFilesystem();
Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(),
getRegionInfo().getEncodedName());
Path regionWALDir = getWALRegionDir();
Path regionDir =
FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo());
// We made a mistake in HBASE-20734 so we need to do this dirty hack...
NavigableSet<Path> filesUnderWrongRegionWALDir =
WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir);
seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS,
filesUnderWrongRegionWALDir, reporter, regionDir));
// This is to ensure backwards compatibility with HBASE-20723 where recovered edits can appear
// under the root dir even if walDir is set.
NavigableSet<Path> filesUnderRootDir = Collections.emptyNavigableSet();
if (!regionWALDir.equals(regionDir)) {
filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir);
seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS,
filesUnderRootDir, reporter, regionDir));
}
NavigableSet<Path> files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir);
seqId = Math.max(seqId,
replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir));
if (seqId > minSeqIdForTheRegion) {
// Then we added some edits to memory. Flush and cleanup split edit files.
internalFlushcache(null, seqId, stores.values(), status, false,
FlushLifeCycleTracker.DUMMY);
}
// Now delete the content of recovered edits. We're done w/ them.
if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
// For debugging data loss issues!
// If this flag is set, make use of the hfile archiving by making recovered.edits a fake
// column family. Have to fake out file type too by casting our recovered.edits as
// storefiles
String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName();
Set<HStoreFile> fakeStoreFiles = new HashSet<>(files.size());
for (Path file : files) {
fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true));
}
getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles);
} else {
deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir));
deleteRecoveredEdits(rootFS, filesUnderRootDir);
}
} else {
Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr);
FileSystem fs = recoveredEditsDir.getFileSystem(conf);
FileStatus[] files = fs.listStatus(recoveredEditsDir);
LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length,
recoveredEditsDir);
if (files != null) {
for (FileStatus file : files) {
seqId =
Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs));
}
}
if (seqId > minSeqIdForTheRegion) {
// Then we added some edits to memory. Flush and cleanup split edit files.
internalFlushcache(null, seqId, stores.values(), status, false,
FlushLifeCycleTracker.DUMMY);
}
deleteRecoveredEdits(fs,
Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList()));
}
return seqId;
}
private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs,
final NavigableSet<Path> files, final CancelableProgressable reporter, final Path regionDir)
throws IOException {
long seqid = minSeqIdForTheRegion;
if (LOG.isDebugEnabled()) {
LOG.debug("Found " + (files == null ? 0 : files.size())
+ " recovered edits file(s) under " + regionDir);
}
if (files == null || files.isEmpty()) {
return minSeqIdForTheRegion;
}
for (Path edits: files) {
if (edits == null || !fs.exists(edits)) {
LOG.warn("Null or non-existent edits file: " + edits);
continue;
}
if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) {
continue;
}
long maxSeqId;
String fileName = edits.getName();
maxSeqId = Math.abs(Long.parseLong(fileName));
if (maxSeqId <= minSeqIdForTheRegion) {
if (LOG.isDebugEnabled()) {
String msg = "Maximum sequenceid for this wal is " + maxSeqId
+ " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion
+ ", skipped the whole file, path=" + edits;
LOG.debug(msg);
}
continue;
}
try {
// replay the edits. Replay can return -1 if everything is skipped, only update
// if seqId is greater
seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs));
} catch (IOException e) {
handleException(fs, edits, e);
}
}
return seqid;
}
private void handleException(FileSystem fs, Path edits, IOException e) throws IOException {
boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
if (conf.get("hbase.skip.errors") != null) {
LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use "
+ HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
}
if (skipErrors) {
Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits);
LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed "
+ edits + " as " + p,
e);
} else {
throw e;
}
}
/**
* @param edits File of recovered edits.
* @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger
* than this to be replayed for each store.
* @return the sequence id of the last edit added to this region out of the recovered edits log or
* minSeqId if nothing added from editlogs.
*/
private long replayRecoveredEdits(final Path edits, Map<byte[], Long> maxSeqIdInStores,
final CancelableProgressable reporter, FileSystem fs) throws IOException {
String msg = "Replaying edits from " + edits;
LOG.info(msg);
MonitoredTask status = TaskMonitor.get().createStatus(msg);
status.setStatus("Opening recovered edits");
WAL.Reader reader = null;
try {
reader = WALFactory.createReader(fs, edits, conf);
long currentEditSeqId = -1;
long currentReplaySeqId = -1;
long firstSeqIdInLog = -1;
long skippedEdits = 0;
long editsCount = 0;
long intervalEdits = 0;
WAL.Entry entry;
HStore store = null;
boolean reported_once = false;
ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
try {
// How many edits seen before we check elapsed time
int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
// How often to send a progress report (default 1/2 master timeout)
int period = this.conf.getInt("hbase.hstore.report.period", 300000);
long lastReport = EnvironmentEdgeManager.currentTime();
if (coprocessorHost != null) {
coprocessorHost.preReplayWALs(this.getRegionInfo(), edits);
}
while ((entry = reader.next()) != null) {
WALKey key = entry.getKey();
WALEdit val = entry.getEdit();
if (ng != null) { // ng is null in some tests, or when nonces are disabled
ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
}
if (reporter != null) {
intervalEdits += val.size();
if (intervalEdits >= interval) {
// Number of edits interval reached
intervalEdits = 0;
long cur = EnvironmentEdgeManager.currentTime();
if (lastReport + period <= cur) {
status.setStatus("Replaying edits..." +
" skipped=" + skippedEdits +
" edits=" + editsCount);
// Timeout reached
if(!reporter.progress()) {
msg = "Progressable reporter failed, stopping replay for region " + this;
LOG.warn(msg);
status.abort(msg);
throw new IOException(msg);
}
reported_once = true;
lastReport = cur;
}
}
}
if (firstSeqIdInLog == -1) {
firstSeqIdInLog = key.getSequenceId();
}
if (currentEditSeqId > key.getSequenceId()) {
// when this condition is true, it means we have a serious defect because we need to
// maintain increasing SeqId for WAL edits per region
LOG.error(getRegionInfo().getEncodedName() + " : "
+ "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
+ "; edit=" + val);
} else {
currentEditSeqId = key.getSequenceId();
}
currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
key.getOrigLogSeqNum() : currentEditSeqId;
// Start coprocessor replay here. The coprocessor is for each WALEdit
// instead of a KeyValue.
if (coprocessorHost != null) {
status.setStatus("Running pre-WAL-restore hook in coprocessors");
if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
// if the coprocessor requests to bypass this wal entry, skip it
continue;
}
}
boolean checkRowWithinBoundary = false;
// Check this edit is for this region.
if (!Bytes.equals(key.getEncodedRegionName(),
this.getRegionInfo().getEncodedNameAsBytes())) {
checkRowWithinBoundary = true;
}
boolean flush = false;
MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing();
for (Cell cell: val.getCells()) {
// Check this edit is for me. Also, guard against writing the special
// METACOLUMN info such as HBASE::CACHEFLUSH entries
if (WALEdit.isMetaEditFamily(cell)) {
// if region names don't match, skip replaying the compaction marker
if (!checkRowWithinBoundary) {
// this is a special edit, we should handle it
CompactionDescriptor compaction = WALEdit.getCompaction(cell);
if (compaction != null) {
//replay the compaction
replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
}
}
skippedEdits++;
continue;
}
// Figure which store the edit is meant for.
if (store == null || !CellUtil.matchingFamily(cell,
store.getColumnFamilyDescriptor().getName())) {
store = getStore(cell);
}
if (store == null) {
// This should never happen. Perhaps schema was changed between
// crash and redeploy?
LOG.warn("No family for cell {} in region {}", cell, this);
skippedEdits++;
continue;
}
if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) {
LOG.warn("Row of {} is not within region boundary for region {}", cell, this);
skippedEdits++;
continue;
}
// Now, figure if we should skip this edit.
if (key.getSequenceId() <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor()
.getName())) {
skippedEdits++;
continue;
}
PrivateCellUtil.setSequenceId(cell, currentReplaySeqId);
restoreEdit(store, cell, memStoreSizing);
editsCount++;
}
MemStoreSize mss = memStoreSizing.getMemStoreSize();
incMemStoreSize(mss);
flush = isFlushSize(this.memStoreSizing.getMemStoreSize());
if (flush) {
internalFlushcache(null, currentEditSeqId, stores.values(), status, false,
FlushLifeCycleTracker.DUMMY);
}
if (coprocessorHost != null) {
coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
}
}
if (coprocessorHost != null) {
coprocessorHost.postReplayWALs(this.getRegionInfo(), edits);
}
} catch (EOFException eof) {
Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits);
msg = "EnLongAddered EOF. Most likely due to Master failure during "
+ "wal splitting, so we have this data in another edit. Continuing, but renaming "
+ edits + " as " + p + " for region " + this;
LOG.warn(msg, eof);
status.abort(msg);
} catch (IOException ioe) {
// If the IOE resulted from bad file format,
// then this problem is idempotent and retrying won't help
if (ioe.getCause() instanceof ParseException) {
Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits);
msg = "File corruption enLongAddered! " +
"Continuing, but renaming " + edits + " as " + p;
LOG.warn(msg, ioe);
status.setStatus(msg);
} else {
status.abort(StringUtils.stringifyException(ioe));
// other IO errors may be transient (bad network connection,
// checksum exception on one datanode, etc). throw & retry
throw ioe;
}
}
if (reporter != null && !reported_once) {
reporter.progress();
}
msg = "Applied " + editsCount + ", skipped " + skippedEdits +
", firstSequenceIdInLog=" + firstSeqIdInLog +
", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
status.markComplete(msg);
LOG.debug(msg);
return currentEditSeqId;
} finally {
status.cleanup();
if (reader != null) {
reader.close();
}
}
}
/**
* Call to complete a compaction. It's for the case where we find in the WAL a compaction
* that was not finished. We could find one when recovering a WAL after a regionserver crash.
* See HBASE-2331.
*/
void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
boolean removeFiles, long replaySeqId)
throws IOException {
try {
checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
"Compaction marker from WAL ", compaction);
} catch (WrongRegionException wre) {
if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
// skip the compaction marker since it is not for this region
return;
}
throw wre;
}
synchronized (writestate) {
if (replaySeqId < lastReplayedOpenRegionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
+ " because its sequence id " + replaySeqId + " is smaller than this regions "
+ "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
return;
}
if (replaySeqId < lastReplayedCompactionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
+ " because its sequence id " + replaySeqId + " is smaller than this regions "
+ "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
return;
} else {
lastReplayedCompactionSeqId = replaySeqId;
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
+ " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
+ lastReplayedOpenRegionSeqId);
}
startRegionOperation(Operation.REPLAY_EVENT);
try {
HStore store = this.getStore(compaction.getFamilyName().toByteArray());
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Found Compaction WAL edit for deleted family:"
+ Bytes.toString(compaction.getFamilyName().toByteArray()));
return;
}
store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
logRegionFiles();
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "At least one of the store files in compaction: "
+ TextFormat.shortDebugString(compaction)
+ " doesn't exist any more. Skip loading the file(s)", ex);
} finally {
closeRegionOperation(Operation.REPLAY_EVENT);
}
}
}
void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
"Flush marker from WAL ", flush);
if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return; // if primary nothing to do
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Replaying flush marker " + TextFormat.shortDebugString(flush));
}
startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
try {
FlushAction action = flush.getAction();
switch (action) {
case START_FLUSH:
replayWALFlushStartMarker(flush);
break;
case COMMIT_FLUSH:
replayWALFlushCommitMarker(flush);
break;
case ABORT_FLUSH:
replayWALFlushAbortMarker(flush);
break;
case CANNOT_FLUSH:
replayWALFlushCannotFlushMarker(flush, replaySeqId);
break;
default:
LOG.warn(getRegionInfo().getEncodedName() + " : " +
"Received a flush event with unknown action, ignoring. " +
TextFormat.shortDebugString(flush));
break;
}
logRegionFiles();
} finally {
closeRegionOperation(Operation.REPLAY_EVENT);
}
}
/** Replay the flush marker from primary region by creating a corresponding snapshot of
* the store memstores, only if the memstores do not have a higher seqId from an earlier wal
* edit (because the events may be coming out of order).
*/
@VisibleForTesting
PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
long flushSeqId = flush.getFlushSequenceNumber();
HashSet<HStore> storesToFlush = new HashSet<>();
for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
byte[] family = storeFlush.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush start marker from primary, but the family is not found. Ignoring"
+ " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
continue;
}
storesToFlush.add(store);
}
MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
// we will use writestate as a coarse-grain lock for all the replay events
// (flush, compaction, region open etc)
synchronized (writestate) {
try {
if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
+ " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
+ " of " + lastReplayedOpenRegionSeqId);
return null;
}
if (numMutationsWithoutWAL.sum() > 0) {
numMutationsWithoutWAL.reset();
dataInMemoryWithoutWAL.reset();
}
if (!writestate.flushing) {
// we do not have an active snapshot and a corresponding this.prepareFlushResult. This means
// we can just snapshot our memstores and continue as normal.
// invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal
PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId,
storesToFlush, status, false, FlushLifeCycleTracker.DUMMY);
if (prepareResult.result == null) {
// save the PrepareFlushResult so that we can use it later from commit flush
this.writestate.flushing = true;
this.prepareFlushResult = prepareResult;
status.markComplete("Flush prepare successful");
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
}
} else {
// special case empty memstore. We will still save the flush result in this case, since
// our memstore is empty, but the primary is still flushing
if (prepareResult.getResult().getResult() ==
FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
this.writestate.flushing = true;
this.prepareFlushResult = prepareResult;
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
}
}
status.abort("Flush prepare failed with " + prepareResult.result);
// nothing much to do. prepare flush failed because of some reason.
}
return prepareResult;
} else {
// we already have an active snapshot.
if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
// They define the same flush. Log and continue.
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush prepare marker with the same seqId: " +
+ flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Ignoring");
// ignore
} else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
// We received a flush with a smaller seqNum than what we have prepared. We can only
// ignore this prepare flush request.
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush prepare marker with a smaller seqId: " +
+ flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Ignoring");
// ignore
} else {
// We received a flush with a larger seqNum than what we have prepared
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush prepare marker with a larger seqId: " +
+ flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Ignoring");
// We do not have multiple active snapshots in the memstore or a way to merge current
// memstore snapshot with the contents and resnapshot for now. We cannot take
// another snapshot and drop the previous one because that will cause temporary
// data loss in the secondary. So we ignore this for now, deferring the resolution
// to happen when we see the corresponding flush commit marker. If we have a memstore
// snapshot with x, and later received another prepare snapshot with y (where x < y),
// when we see flush commit for y, we will drop snapshot for x, and can also drop all
// the memstore edits if everything in memstore is < y. This is the usual case for
// RS crash + recovery where we might see consecutive prepare flush wal markers.
// Otherwise, this will cause more memory to be used in secondary replica until a
// further prepare + commit flush is seen and replayed.
}
}
} finally {
status.cleanup();
writestate.notifyAll();
}
}
return null;
}
@VisibleForTesting
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
justification="Intentional; post memstore flush")
void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
// check whether we have the memstore snapshot with the corresponding seqId. Replay to
// secondary region replicas are in order, except for when the region moves or when the
// region server crashes. In those cases, we may receive replay requests out of order from
// the original seqIds.
synchronized (writestate) {
try {
if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
+ " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
+ " of " + lastReplayedOpenRegionSeqId);
return;
}
if (writestate.flushing) {
PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
+ " and a previous prepared snapshot was found");
}
// This is the regular case where we received commit flush after prepare flush
// corresponding to the same seqId.
replayFlushInStores(flush, prepareFlushResult, true);
// Set down the memstore size by amount of flush.
this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize());
this.prepareFlushResult = null;
writestate.flushing = false;
} else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
// This should not happen normally. However, let's be safe and guard against these cases
// we received a flush commit with a smaller seqId than what we have prepared
// we will pick the flush file up from this commit (if we have not seen it), but we
// will not drop the memstore
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with smaller seqId: "
+ flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
+" prepared memstore snapshot");
replayFlushInStores(flush, prepareFlushResult, false);
// snapshot is not dropped, so memstore sizes should not be decremented
// we still have the prepared snapshot, flushing should still be true
} else {
// This should not happen normally. However, let's be safe and guard against these cases
// we received a flush commit with a larger seqId than what we have prepared
// we will pick the flush file for this. We will also obtain the updates lock and
// look for contents of the memstore to see whether we have edits after this seqId.
// If not, we will drop all the memstore edits and the snapshot as well.
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with larger seqId: "
+ flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
+" memstore snapshot");
replayFlushInStores(flush, prepareFlushResult, true);
// Set down the memstore size by amount of flush.
this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize());
// Inspect the memstore contents to see whether the memstore contains only edits
// with seqId smaller than the flush seqId. If so, we can discard those edits.
dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
this.prepareFlushResult = null;
writestate.flushing = false;
}
// If we were waiting for observing a flush or region opening event for not showing
// partial data after a secondary region crash, we can allow reads now. We can only make
// sure that we are not showing partial data (for example skipping some previous edits)
// until we observe a full flush start and flush commit. So if we were not able to find
// a previous flush we will not enable reads now.
this.setReadsEnabled(true);
} else {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
+ ", but no previous prepared snapshot was found");
// There is no corresponding prepare snapshot from before.
// We will pick up the new flushed file
replayFlushInStores(flush, null, false);
// Inspect the memstore contents to see whether the memstore contains only edits
// with seqId smaller than the flush seqId. If so, we can discard those edits.
dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
}
status.markComplete("Flush commit successful");
// Update the last flushed sequence id for region.
this.maxFlushedSeqId = flush.getFlushSequenceNumber();
// advance the mvcc read point so that the new flushed file is visible.
mvcc.advanceTo(flush.getFlushSequenceNumber());
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
+ " doesn't exist any more. Skip loading the file(s)", ex);
}
finally {
status.cleanup();
writestate.notifyAll();
}
}
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
}
/**
* Replays the given flush descriptor by opening the flush files in stores and dropping the
* memstore snapshots if requested.
* @param flush the flush descriptor replayed from the WAL
* @param prepareFlushResult the result of a previously replayed flush prepare marker, may be null
* @param dropMemstoreSnapshot whether to drop the prepared memstore snapshot after replaying
* @throws IOException if opening the flush files fails
*/
private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
boolean dropMemstoreSnapshot)
throws IOException {
for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
byte[] family = storeFlush.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker from primary, but the family is not found."
+ "Ignoring StoreFlushDescriptor:" + storeFlush);
continue;
}
List<String> flushFiles = storeFlush.getFlushOutputList();
StoreFlushContext ctx = null;
long startTime = EnvironmentEdgeManager.currentTime();
if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY);
} else {
ctx = prepareFlushResult.storeFlushCtxs.get(family);
startTime = prepareFlushResult.startTime;
}
if (ctx == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Unexpected: flush commit marker received from store "
+ Bytes.toString(family) + " but no associated flush context. Ignoring");
continue;
}
ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
// Record latest flush time
this.lastStoreFlushTimeMap.put(store, startTime);
}
}
private long loadRecoveredHFilesIfAny(Collection<HStore> stores) throws IOException {
Path regionDir = fs.getRegionDir();
long maxSeqId = -1;
for (HStore store : stores) {
String familyName = store.getColumnFamilyName();
FileStatus[] files =
WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName);
if (files != null && files.length != 0) {
for (FileStatus file : files) {
Path filePath = file.getPath();
// If file length is zero then delete it
if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) {
continue;
}
try {
HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath());
maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID());
} catch (IOException e) {
handleException(fs.getFileSystem(), filePath, e);
continue;
}
}
if (this.rsServices != null && store.needsCompaction()) {
this.rsServices.getCompactionRequestor()
.requestCompaction(this, store, "load recovered hfiles request compaction",
Store.PRIORITY_USER + 1, CompactionLifeCycleTracker.DUMMY, null);
}
}
}
return maxSeqId;
}
/**
* Be careful, this method will drop all data in the memstore of this region.
* Currently, this method is used to drop memstore to prevent memory leak
* when replaying recovered.edits while opening region.
*/
public MemStoreSize dropMemStoreContents() throws IOException {
MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing();
this.updatesLock.writeLock().lock();
try {
for (HStore s : stores.values()) {
MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM);
LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region "
+ this.getRegionInfo().getRegionNameAsString()
+ " , dropped memstoresize: [" + memStoreSize + " }");
totalFreedSize.incMemStoreSize(memStoreSize);
}
return totalFreedSize.getMemStoreSize();
} finally {
this.updatesLock.writeLock().unlock();
}
}
/**
* Drops the memstore contents after replaying a flush descriptor or region open event replay
* if the memstore edits have seqNums smaller than the given seq id
* @throws IOException
*/
private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException {
MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing();
this.updatesLock.writeLock().lock();
try {
long currentSeqId = mvcc.getReadPoint();
if (seqId >= currentSeqId) {
// then we can drop the memstore contents since everything is below this seqId
LOG.info(getRegionInfo().getEncodedName() + " : "
+ "Dropping memstore contents as well since replayed flush seqId: "
+ seqId + " is greater than current seqId:" + currentSeqId);
// Prepare flush (take a snapshot) and then abort (drop the snapshot)
if (store == null) {
for (HStore s : stores.values()) {
totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId));
}
} else {
totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId));
}
} else {
LOG.info(getRegionInfo().getEncodedName() + " : "
+ "Not dropping memstore contents since replayed flush seqId: "
+ seqId + " is smaller than current seqId:" + currentSeqId);
}
} finally {
this.updatesLock.writeLock().unlock();
}
return totalFreedSize.getMemStoreSize();
}
private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId)
throws IOException {
MemStoreSize flushableSize = s.getFlushableSize();
this.decrMemStoreSize(flushableSize);
StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY);
ctx.prepare();
ctx.abort();
return flushableSize;
}
private void replayWALFlushAbortMarker(FlushDescriptor flush) {
// nothing to do for now. A flush abort will cause a RS abort which means that the region
// will be opened somewhere else later. We will see the region open event soon, and replaying
// that will drop the snapshot
}
private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
synchronized (writestate) {
if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
+ " because its sequence id " + replaySeqId + " is smaller than this regions "
+ "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
return;
}
// If we were waiting for observing a flush or region opening event for not showing partial
// data after a secondary region crash, we can allow reads now. This event means that the
// primary was not able to flush because memstore is empty when we requested flush. By the
// time we observe this, we are guaranteed to have up to date seqId with our previous
// assignment.
this.setReadsEnabled(true);
}
}
@VisibleForTesting
PrepareFlushResult getPrepareFlushResult() {
return prepareFlushResult;
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
justification="Intentional; cleared the memstore")
void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
"RegionEvent marker from WAL ", regionEvent);
startRegionOperation(Operation.REPLAY_EVENT);
try {
if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return; // if primary nothing to do
}
if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
// nothing to do on REGION_CLOSE for now.
return;
}
if (regionEvent.getEventType() != EventType.REGION_OPEN) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Unknown region event received, ignoring :"
+ TextFormat.shortDebugString(regionEvent));
return;
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
}
// we will use writestate as a coarse-grain lock for all the replay events
synchronized (writestate) {
// Replication can deliver events out of order when primary region moves or the region
// server crashes, since there is no coordination between replication of different wal files
// belonging to different region servers. We have to safeguard against this case by using
// region open event's seqid. Since this is the first event that the region puts (after
// possibly flushing recovered.edits), after seeing this event, we can ignore every edit
// smaller than this seqId
if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
} else {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
+ " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
+ " of " + lastReplayedOpenRegionSeqId);
return;
}
// region open lists all the files that the region has at the time of the opening. Just pick
// all the files and drop prepared flushes and empty memstores
for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
// stores of primary may be different now
byte[] family = storeDescriptor.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a region open marker from primary, but the family is not found. "
+ "Ignoring. StoreDescriptor:" + storeDescriptor);
continue;
}
long storeSeqId = store.getMaxSequenceId().orElse(0L);
List<String> storeFiles = storeDescriptor.getStoreFileList();
try {
store.refreshStoreFiles(storeFiles); // replace the files with the new ones
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "At least one of the store files: " + storeFiles
+ " doesn't exist any more. Skip loading the file(s)", ex);
continue;
}
if (store.getMaxSequenceId().orElse(0L) != storeSeqId) {
// Record latest flush time if we picked up new files
lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
}
if (writestate.flushing) {
// only drop memstore snapshots if they are smaller than last flush for the store
if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
null : this.prepareFlushResult.storeFlushCtxs.get(family);
if (ctx != null) {
MemStoreSize mss = store.getFlushableSize();
ctx.abort();
this.decrMemStoreSize(mss);
this.prepareFlushResult.storeFlushCtxs.remove(family);
}
}
}
// Drop the memstore contents if they are now smaller than the latest seen flushed file
dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
if (storeSeqId > this.maxFlushedSeqId) {
this.maxFlushedSeqId = storeSeqId;
}
}
// if all stores ended up dropping their snapshots, we can safely drop the
// prepareFlushResult
dropPrepareFlushIfPossible();
// advance the mvcc read point so that the new flushed file is visible.
mvcc.await();
// If we were waiting for observing a flush or region opening event for not showing partial
// data after a secondary region crash, we can allow reads now.
this.setReadsEnabled(true);
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
}
logRegionFiles();
} finally {
closeRegionOperation(Operation.REPLAY_EVENT);
}
}
void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
"BulkLoad marker from WAL ", bulkLoadEvent);
if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return; // if primary nothing to do
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
}
// check if multiple families involved
boolean multipleFamilies = false;
byte[] family = null;
for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
byte[] fam = storeDescriptor.getFamilyName().toByteArray();
if (family == null) {
family = fam;
} else if (!Bytes.equals(family, fam)) {
multipleFamilies = true;
break;
}
}
startBulkRegionOperation(multipleFamilies);
try {
// we will use writestate as a coarse-grain lock for all the replay events
synchronized (writestate) {
// Replication can deliver events out of order when primary region moves or the region
// server crashes, since there is no coordination between replication of different wal files
// belonging to different region servers. We have to safeguard against this case by using
// region open event's seqid. Since this is the first event that the region puts (after
// possibly flushing recovered.edits), after seeing this event, we can ignore every edit
// smaller than this seqId
if (bulkLoadEvent.getBulkloadSeqNum() >= 0
&& this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Skipping replaying bulkload event :"
+ TextFormat.shortDebugString(bulkLoadEvent)
+ " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
+ " =" + lastReplayedOpenRegionSeqId);
return;
}
for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
// stores of primary may be different now
family = storeDescriptor.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a bulk load marker from primary, but the family is not found. "
+ "Ignoring. StoreDescriptor:" + storeDescriptor);
continue;
}
List<String> storeFiles = storeDescriptor.getStoreFileList();
for (String storeFile : storeFiles) {
StoreFileInfo storeFileInfo = null;
try {
storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
store.bulkLoadHFile(storeFileInfo);
} catch(FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ ((storeFileInfo != null) ? storeFileInfo.toString() :
(new Path(Bytes.toString(family), storeFile)).toString())
+ " doesn't exist any more. Skip loading the file");
}
}
}
}
if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
}
} finally {
closeBulkRegionOperation();
}
}
/**
* If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
*/
private void dropPrepareFlushIfPossible() {
if (writestate.flushing) {
boolean canDrop = true;
if (prepareFlushResult.storeFlushCtxs != null) {
for (Entry<byte[], StoreFlushContext> entry : prepareFlushResult.storeFlushCtxs.entrySet()) {
HStore store = getStore(entry.getKey());
if (store == null) {
continue;
}
if (store.getSnapshotSize().getDataSize() > 0) {
canDrop = false;
break;
}
}
}
// this means that all the stores in the region have finished flushing, but the WAL marker
// may not have been written or we did not receive it yet.
if (canDrop) {
writestate.flushing = false;
this.prepareFlushResult = null;
}
}
}
@Override
public boolean refreshStoreFiles() throws IOException {
return refreshStoreFiles(false);
}
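// Usage sketch for the refresh path, illustrative only; 'secondaryRegion' is an assumed
// reference to an HRegion opened as a non-default replica, where the no-arg override is not
// short-circuited:
//
//   boolean freedMemory = secondaryRegion.refreshStoreFiles();
//   // returns true when newly flushed files were picked up and some memstore data was dropped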
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
justification = "Notify is about post replay. Intentional")
protected boolean refreshStoreFiles(boolean force) throws IOException {
if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return false; // if primary nothing to do
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Refreshing store files to see whether we can free up memstore");
}
long totalFreedDataSize = 0;
long smallestSeqIdInStores = Long.MAX_VALUE;
startRegionOperation(); // obtain region close lock
try {
Map<HStore, Long> map = new HashMap<>();
synchronized (writestate) {
for (HStore store : stores.values()) {
// TODO: some stores might see new data from flush, while others do not which
// MIGHT break atomic edits across column families.
long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L);
// refresh the store files. This is similar to observing a region open wal marker.
store.refreshStoreFiles();
long storeSeqId = store.getMaxSequenceId().orElse(0L);
if (storeSeqId < smallestSeqIdInStores) {
smallestSeqIdInStores = storeSeqId;
}
// see whether we can drop the memstore or the snapshot
if (storeSeqId > maxSeqIdBefore) {
if (writestate.flushing) {
// only drop memstore snapshots if they are smaller than last flush for the store
if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
null : this.prepareFlushResult.storeFlushCtxs.get(
store.getColumnFamilyDescriptor().getName());
if (ctx != null) {
MemStoreSize mss = store.getFlushableSize();
ctx.abort();
this.decrMemStoreSize(mss);
this.prepareFlushResult.storeFlushCtxs.
remove(store.getColumnFamilyDescriptor().getName());
totalFreedDataSize += mss.getDataSize();
}
}
}
map.put(store, storeSeqId);
}
}
// if all stores ended up dropping their snapshots, we can safely drop the
// prepareFlushResult
dropPrepareFlushIfPossible();
// advance the mvcc read point so that the new flushed files are visible.
// either greater than flush seq number or they were already picked up via flush.
for (HStore s : stores.values()) {
mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L));
}
// smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
// skip all edits that are to be replayed in the future that have a smaller seqId
// than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
// that we have picked the flush files for
if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
}
}
if (!map.isEmpty()) {
for (Map.Entry<HStore, Long> entry : map.entrySet()) {
// Drop the memstore contents if they are now smaller than the latest seen flushed file
totalFreedDataSize += dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey())
.getDataSize();
}
}
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
return totalFreedDataSize > 0;
} finally {
closeRegionOperation();
}
}
private void logRegionFiles() {
if (LOG.isTraceEnabled()) {
LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
stores.values().stream().filter(s -> s.getStorefiles() != null)
.flatMap(s -> s.getStorefiles().stream())
.forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf));
}
}
/** Checks whether the given regionName is either equal to our region, or that
* the regionName is the primary region to our corresponding range for the secondary replica.
*/
private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
throws WrongRegionException {
if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
return;
}
if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
Bytes.equals(encodedRegionName,
this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
return;
}
throw new WrongRegionException(exceptionMsg + payload
+ " targetted for region " + Bytes.toStringBinary(encodedRegionName)
+ " does not match this region: " + this.getRegionInfo());
}
/**
* Used by tests
* @param s Store to add edit to.
* @param cell Cell to add.
*/
@VisibleForTesting
protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) {
s.add(cell, memstoreAccounting);
}
/**
* @param p File to check.
* @return True if file was zero-length (and if so, we'll delete it in here).
* @throws IOException
*/
private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat,
final Path p) throws IOException {
if (stat.getLen() > 0) {
return false;
}
LOG.warn("File " + p + " is zero-length, deleting.");
fs.delete(p, false);
return true;
}
protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup)
throws IOException {
if (family.isMobEnabled()) {
if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS +
" is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY +
" accordingly.");
}
return new HMobStore(this, family, this.conf, warmup);
}
return new HStore(this, family, this.conf, warmup);
}
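// Configuration sketch for the MOB requirement checked above, illustrative only; 'conf' is an
// assumed Configuration used when opening the region:
//
//   conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MIN_FORMAT_VERSION_WITH_TAGS);
//   // without at least this HFile format version, instantiateHStore() rejects MOB-enabled families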
@Override
public HStore getStore(byte[] column) {
return this.stores.get(column);
}
/**
* Return HStore instance. Does not do any copy: as the number of stores is limited, we iterate on
* the list.
*/
private HStore getStore(Cell cell) {
return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey()))
.map(e -> e.getValue()).findFirst().orElse(null);
}
@Override
public List<HStore> getStores() {
return new ArrayList<>(stores.values());
}
@Override
public List<String> getStoreFileList(byte[][] columns) throws IllegalArgumentException {
List<String> storeFileNames = new ArrayList<>();
synchronized (closeLock) {
for (byte[] column : columns) {
HStore store = this.stores.get(column);
if (store == null) {
throw new IllegalArgumentException(
"No column family : " + new String(column, StandardCharsets.UTF_8) + " available");
}
Collection<HStoreFile> storeFiles = store.getStorefiles();
if (storeFiles == null) {
continue;
}
for (HStoreFile storeFile : storeFiles) {
storeFileNames.add(storeFile.getPath().toString());
}
logRegionFiles();
}
}
return storeFileNames;
}
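// Usage sketch, illustrative only; 'region' and the family name "cf" are assumptions:
//
//   List<String> hfilePaths = region.getStoreFileList(new byte[][] { Bytes.toBytes("cf") });
//   // throws IllegalArgumentException if "cf" is not a column family of this region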
//////////////////////////////////////////////////////////////////////////////
// Support code
//////////////////////////////////////////////////////////////////////////////
/** Make sure this is a valid row for the HRegion */
void checkRow(byte[] row, String op) throws IOException {
if (!rowIsInRange(getRegionInfo(), row)) {
throw new WrongRegionException("Requested row out of range for " +
op + " on HRegion " + this + ", startKey='" +
Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" +
Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
Bytes.toStringBinary(row) + "'");
}
}
/**
* Get an exclusive (write) lock on a given row.
* @param row Which row to lock.
* @return A locked RowLock. The lock is exclusive and already acquired.
* @throws IOException
*/
public RowLock getRowLock(byte[] row) throws IOException {
return getRowLock(row, false);
}
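// Usage sketch, illustrative only; 'region' and 'row' are assumptions. Row locks should be
// released in a finally block so the RowLockContext reference count does not leak:
//
//   RowLock rowLock = region.getRowLock(row, false); // exclusive (write) lock
//   try {
//     // mutate the row while holding the lock
//   } finally {
//     rowLock.release();
//   }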
@Override
public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
checkRow(row, "row lock");
return getRowLockInternal(row, readLock, null);
}
protected RowLock getRowLockInternal(byte[] row, boolean readLock, final RowLock prevRowLock)
throws IOException {
// create an object to use as a key in the row lock map
HashedBytes rowKey = new HashedBytes(row);
RowLockContext rowLockContext = null;
RowLockImpl result = null;
boolean success = false;
try (TraceScope scope = TraceUtil.createTrace("HRegion.getRowLock")) {
TraceUtil.addTimelineAnnotation("Getting a " + (readLock?"readLock":"writeLock"));
// Keep trying until we have a lock or error out.
// TODO: do we need to add a time component here?
while (result == null) {
rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey));
// Now try and get the lock.
// This can fail if the lock context was cleaned up concurrently (newReadLock/newWriteLock
// return null), in which case we loop and look the context up again.
if (readLock) {
// For read lock, if the caller has locked the same row previously, it will not try
// to acquire the same read lock. It simply returns the previous row lock.
RowLockImpl prevRowLockImpl = (RowLockImpl)prevRowLock;
if ((prevRowLockImpl != null) && (prevRowLockImpl.getLock() ==
rowLockContext.readWriteLock.readLock())) {
success = true;
return prevRowLock;
}
result = rowLockContext.newReadLock();
} else {
result = rowLockContext.newWriteLock();
}
}
int timeout = rowLockWaitDuration;
boolean reachDeadlineFirst = false;
Optional<RpcCall> call = RpcServer.getCurrentCall();
if (call.isPresent()) {
long deadline = call.get().getDeadline();
if (deadline < Long.MAX_VALUE) {
int timeToDeadline = (int) (deadline - System.currentTimeMillis());
if (timeToDeadline <= this.rowLockWaitDuration) {
reachDeadlineFirst = true;
timeout = timeToDeadline;
}
}
}
if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) {
TraceUtil.addTimelineAnnotation("Failed to get row lock");
String message = "Timed out waiting for lock for row: " + rowKey + " in region "
+ getRegionInfo().getEncodedName();
if (reachDeadlineFirst) {
throw new TimeoutIOException(message);
} else {
// If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request.
throw new IOException(message);
}
}
rowLockContext.setThreadName(Thread.currentThread().getName());
success = true;
return result;
} catch (InterruptedException ie) {
LOG.warn("Thread interrupted waiting for lock on row: {}, in region {}", rowKey,
getRegionInfo().getRegionNameAsString());
InterruptedIOException iie = new InterruptedIOException();
iie.initCause(ie);
TraceUtil.addTimelineAnnotation("Interrupted exception getting row lock");
Thread.currentThread().interrupt();
throw iie;
} catch (Error error) {
// The maximum lock count for read lock is 64K (hardcoded), when this maximum count
// is reached, it will throw out an Error. This Error needs to be caught so it can
// go ahead to process the minibatch with lock acquired.
LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row),
getRegionInfo().getRegionNameAsString(), error);
IOException ioe = new IOException(error);
TraceUtil.addTimelineAnnotation("Error getting row lock");
throw ioe;
} finally {
// Clean up the counts just in case this was the thing keeping the context alive.
if (!success && rowLockContext != null) {
rowLockContext.cleanUp();
}
}
}
private void releaseRowLocks(List<RowLock> rowLocks) {
if (rowLocks != null) {
for (RowLock rowLock : rowLocks) {
rowLock.release();
}
rowLocks.clear();
}
}
@VisibleForTesting
public int getReadLockCount() {
return lock.getReadLockCount();
}
public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
return lockedRows;
}
@VisibleForTesting
class RowLockContext {
private final HashedBytes row;
final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true);
final AtomicBoolean usable = new AtomicBoolean(true);
final AtomicInteger count = new AtomicInteger(0);
final Object lock = new Object();
private String threadName;
RowLockContext(HashedBytes row) {
this.row = row;
}
RowLockImpl newWriteLock() {
Lock l = readWriteLock.writeLock();
return getRowLock(l);
}
RowLockImpl newReadLock() {
Lock l = readWriteLock.readLock();
return getRowLock(l);
}
private RowLockImpl getRowLock(Lock l) {
count.incrementAndGet();
synchronized (lock) {
if (usable.get()) {
return new RowLockImpl(this, l);
} else {
return null;
}
}
}
void cleanUp() {
long c = count.decrementAndGet();
if (c <= 0) {
synchronized (lock) {
if (count.get() <= 0 && usable.get()){ // Don't attempt to remove row if already removed
usable.set(false);
RowLockContext removed = lockedRows.remove(row);
assert removed == this: "we should never remove a different context";
}
}
}
}
public void setThreadName(String threadName) {
this.threadName = threadName;
}
@Override
public String toString() {
return "RowLockContext{" +
"row=" + row +
", readWriteLock=" + readWriteLock +
", count=" + count +
", threadName=" + threadName +
'}';
}
}
/**
* Class used to represent a lock on a row.
*/
public static class RowLockImpl implements RowLock {
private final RowLockContext context;
private final Lock lock;
public RowLockImpl(RowLockContext context, Lock lock) {
this.context = context;
this.lock = lock;
}
public Lock getLock() {
return lock;
}
@VisibleForTesting
public RowLockContext getContext() {
return context;
}
@Override
public void release() {
lock.unlock();
context.cleanUp();
}
@Override
public String toString() {
return "RowLockImpl{" +
"context=" + context +
", lock=" + lock +
'}';
}
}
/**
* Determines whether multiple column families are present
* Precondition: familyPaths is not null
*
* @param familyPaths List of (column family, hfilePath)
*/
private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
boolean multipleFamilies = false;
byte[] family = null;
for (Pair<byte[], String> pair : familyPaths) {
byte[] fam = pair.getFirst();
if (family == null) {
family = fam;
} else if (!Bytes.equals(family, fam)) {
multipleFamilies = true;
break;
}
}
return multipleFamilies;
}
/**
* Attempts to atomically load a group of hfiles. This is critical for loading
* rows with multiple column families atomically.
*
* @param familyPaths List of Pair<byte[] column family, String hfilePath>
* @param bulkLoadListener Internal hooks enabling massaging/preparation of a
* file about to be bulk loaded
* @param assignSeqId whether to flush first and assign a sequence id for the bulk loaded files (see HBASE-10958)
* @return Map from family to List of store file paths if successful, null if failed recoverably
* @throws IOException if failed unrecoverably.
*/
public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, boolean assignSeqId,
BulkLoadListener bulkLoadListener) throws IOException {
return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false,
null, true);
}
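// Usage sketch, illustrative only; the family name "cf" and the HFile path are assumptions:
//
//   List<Pair<byte[], String>> familyPaths = new ArrayList<>();
//   familyPaths.add(new Pair<>(Bytes.toBytes("cf"), "/staging/cf/hfile1"));
//   Map<byte[], List<Path>> loaded = region.bulkLoadHFiles(familyPaths, true, null);
//   // a null return means a recoverable failure (e.g. an HFile no longer fits this region)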
/**
* Listener class to enable callers of
* bulkLoadHFile() to perform any necessary
* pre/post processing of a given bulkload call
*/
public interface BulkLoadListener {
/**
* Called before an HFile is actually loaded
* @param family family being loaded to
* @param srcPath path of HFile
* @return final path to be used for actual loading
* @throws IOException
*/
String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile)
throws IOException;
/**
* Called after a successful HFile load
* @param family family being loaded to
* @param srcPath path of HFile
* @throws IOException
*/
void doneBulkLoad(byte[] family, String srcPath) throws IOException;
/**
* Called after a failed HFile load
* @param family family being loaded to
* @param srcPath path of HFile
* @throws IOException
*/
void failedBulkLoad(byte[] family, String srcPath) throws IOException;
}
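// Minimal listener sketch, illustrative only: loads files in place and ignores the callbacks.
// Real callers (e.g. secure bulk load) typically stage or clean up files in these hooks.
//
//   BulkLoadListener listener = new BulkLoadListener() {
//     @Override
//     public String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile) {
//       return srcPath; // use the file from its current location
//     }
//     @Override
//     public void doneBulkLoad(byte[] family, String srcPath) {}
//     @Override
//     public void failedBulkLoad(byte[] family, String srcPath) {}
//   };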
/**
* Attempts to atomically load a group of hfiles. This is critical for loading
* rows with multiple column families atomically.
*
* @param familyPaths List of Pair<byte[] column family, String hfilePath>
* @param assignSeqId whether to flush first and assign a sequence id for the bulk loaded files (see HBASE-10958)
* @param bulkLoadListener Internal hooks enabling massaging/preparation of a
* file about to be bulk loaded
* @param copyFile always copy hfiles if true
* @param clusterIds ids from clusters that had already handled the given bulkload event.
* @return Map from family to List of store file paths if successful, null if failed recoverably
* @throws IOException if failed unrecoverably.
*/
public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile,
List<String> clusterIds, boolean replicate) throws IOException {
long seqId = -1;
Map<byte[], List<Path>> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
Map<String, Long> storeFilesSizes = new HashMap<>();
Preconditions.checkNotNull(familyPaths);
// we need writeLock for multi-family bulk load
startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
boolean isSuccessful = false;
try {
this.writeRequestsCount.increment();
// There possibly was a split that happened between when the split keys
// were gathered and before the HRegion's write lock was taken. We need
// to validate the HFile region before attempting to bulk load all of them
IOException ioException = null;
List<Pair<byte[], String>> failures = new ArrayList<>();
for (Pair<byte[], String> p : familyPaths) {
byte[] familyName = p.getFirst();
String path = p.getSecond();
HStore store = getStore(familyName);
if (store == null) {
ioException = new org.apache.hadoop.hbase.DoNotRetryIOException(
"No such column family " + Bytes.toStringBinary(familyName));
} else {
try {
store.assertBulkLoadHFileOk(new Path(path));
} catch (WrongRegionException wre) {
// recoverable (file doesn't fit in region)
failures.add(p);
} catch (IOException ioe) {
// unrecoverable (hdfs problem)
ioException = ioe;
}
}
// validation failed because of some sort of IO problem.
if (ioException != null) {
LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this,
ioException);
throw ioException;
}
}
// validation failed, bail out before doing anything permanent.
if (failures.size() != 0) {
StringBuilder list = new StringBuilder();
for (Pair<byte[], String> p : failures) {
list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
.append(p.getSecond());
}
// problem when validating
LOG.warn("There was a recoverable bulk load failure likely due to a split. These (family,"
+ " HFile) pairs were not loaded: {}, in region {}", list.toString(), this);
return null;
}
// We need to assign a sequential ID that's in between two memstores in order to preserve
// the guarantee that all the edits lower than the highest sequential ID from all the
// HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is
// guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
// a sequence id that we can be sure is beyond the last hfile written).
if (assignSeqId) {
FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY);
if (fs.isFlushSucceeded()) {
seqId = ((FlushResultImpl)fs).flushSequenceId;
} else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
seqId = ((FlushResultImpl)fs).flushSequenceId;
} else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) {
// CANNOT_FLUSH may mean that a flush is already on-going
// we need to wait for that flush to complete
waitForFlushes();
} else {
throw new IOException("Could not bulk load with an assigned sequential ID because the "+
"flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
}
}
Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath =
new TreeMap<>(Bytes.BYTES_COMPARATOR);
for (Pair<byte[], String> p : familyPaths) {
byte[] familyName = p.getFirst();
String path = p.getSecond();
HStore store = getStore(familyName);
if (!familyWithFinalPath.containsKey(familyName)) {
familyWithFinalPath.put(familyName, new ArrayList<>());
}
List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName);
try {
String finalPath = path;
if (bulkLoadListener != null) {
finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile);
}
Pair<Path, Path> pair = store.preBulkLoadHFile(finalPath, seqId);
lst.add(pair);
} catch (IOException ioe) {
// A failure here can cause an atomicity violation that we currently
// cannot recover from since it is likely a failed HDFS operation.
LOG.error("There was a partial failure due to IO when attempting to" +
" load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
if (bulkLoadListener != null) {
try {
bulkLoadListener.failedBulkLoad(familyName, path);
} catch (Exception ex) {
LOG.error("Error while calling failedBulkLoad for family " +
Bytes.toString(familyName) + " with path " + path, ex);
}
}
throw ioe;
}
}
if (this.getCoprocessorHost() != null) {
for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue());
}
}
for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
byte[] familyName = entry.getKey();
for (Pair<Path, Path> p : entry.getValue()) {
String path = p.getFirst().toString();
Path commitedStoreFile = p.getSecond();
HStore store = getStore(familyName);
try {
store.bulkLoadHFile(familyName, path, commitedStoreFile);
// Note the size of the store file
try {
FileSystem fs = commitedStoreFile.getFileSystem(baseConf);
storeFilesSizes.put(commitedStoreFile.getName(), fs.getFileStatus(commitedStoreFile)
.getLen());
} catch (IOException e) {
LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e);
storeFilesSizes.put(commitedStoreFile.getName(), 0L);
}
if(storeFiles.containsKey(familyName)) {
storeFiles.get(familyName).add(commitedStoreFile);
} else {
List<Path> storeFileNames = new ArrayList<>();
storeFileNames.add(commitedStoreFile);
storeFiles.put(familyName, storeFileNames);
}
if (bulkLoadListener != null) {
bulkLoadListener.doneBulkLoad(familyName, path);
}
} catch (IOException ioe) {
// A failure here can cause an atomicity violation that we currently
// cannot recover from since it is likely a failed HDFS operation.
// TODO Need a better story for reverting partial failures due to HDFS.
LOG.error("There was a partial failure due to IO when attempting to" +
" load " + Bytes.toString(familyName) + " : " + p.getSecond(), ioe);
if (bulkLoadListener != null) {
try {
bulkLoadListener.failedBulkLoad(familyName, path);
} catch (Exception ex) {
LOG.error("Error while calling failedBulkLoad for family " +
Bytes.toString(familyName) + " with path " + path, ex);
}
}
throw ioe;
}
}
}
isSuccessful = true;
} finally {
if (wal != null && !storeFiles.isEmpty()) {
// Write a bulk load event for hfiles that are loaded
try {
WALProtos.BulkLoadDescriptor loadDescriptor =
ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(),
UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()),
storeFiles, storeFilesSizes, seqId, clusterIds, replicate);
WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(),
loadDescriptor, mvcc);
} catch (IOException ioe) {
if (this.rsServices != null) {
// Have to abort region server because some hfiles have been loaded but we can't write
// the event into WAL
isSuccessful = false;
this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
}
}
}
closeBulkRegionOperation();
}
return isSuccessful ? storeFiles : null;
}
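// Illustrative only (not part of HRegion): a minimal sketch of how a caller might drive the bulk
// load path above. It assumes a public bulkLoadHFiles(Collection<Pair<byte[], String>>, boolean,
// BulkLoadListener) overload and a pre-written HFile; the family name and staging path below are
// hypothetical.
//
//   Collection<Pair<byte[], String>> familyPaths = new ArrayList<>();
//   familyPaths.add(new Pair<>(Bytes.toBytes("cf"), "/staging/cf/hfile1"));
//   // assignSeqId=true lets loaded files be ordered relative to existing data
//   Map<byte[], List<Path>> loaded = region.bulkLoadHFiles(familyPaths, true, null);
//   // a null return means the load did not complete successfully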
@Override
public boolean equals(Object o) {
return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
((HRegion) o).getRegionInfo().getRegionName());
}
@Override
public int hashCode() {
return Bytes.hashCode(getRegionInfo().getRegionName());
}
@Override
public String toString() {
return getRegionInfo().getRegionNameAsString();
}
/**
* RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
*/
class RegionScannerImpl
implements RegionScanner, Shipper, org.apache.hadoop.hbase.ipc.RpcCallback {
// Package local for testability
KeyValueHeap storeHeap = null;
/** Heap of key-values that are not essential for the provided filters and are thus read
* on demand, if on-demand column family loading is enabled.*/
KeyValueHeap joinedHeap = null;
/**
* If the joined heap data gathering is interrupted due to scan limits, this will
* contain the row for which we are populating the values.*/
protected Cell joinedContinuationRow = null;
private boolean filterClosed = false;
protected final byte[] stopRow;
protected final boolean includeStopRow;
protected final HRegion region;
protected final CellComparator comparator;
private final long readPt;
private final long maxResultSize;
private final ScannerContext defaultScannerContext;
private final FilterWrapper filter;
@Override
public RegionInfo getRegionInfo() {
return region.getRegionInfo();
}
RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
throws IOException {
this(scan, additionalScanners, region, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region,
long nonceGroup, long nonce) throws IOException {
this.region = region;
this.maxResultSize = scan.getMaxResultSize();
if (scan.hasFilter()) {
this.filter = new FilterWrapper(scan.getFilter());
} else {
this.filter = null;
}
this.comparator = region.getCellComparator();
/**
* By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
* scanner context that can be used to enforce the batch limit in the event that a
* ScannerContext is not specified during an invocation of next/nextRaw
*/
defaultScannerContext = ScannerContext.newBuilder()
.setBatchLimit(scan.getBatch()).build();
this.stopRow = scan.getStopRow();
this.includeStopRow = scan.includeStopRow();
// synchronize on scannerReadPoints so that nobody calculates
// getSmallestReadPoint, before scannerReadPoints is updated.
IsolationLevel isolationLevel = scan.getIsolationLevel();
long mvccReadPoint = PackagePrivateFieldAccessor.getMvccReadPoint(scan);
synchronized (scannerReadPoints) {
if (mvccReadPoint > 0) {
this.readPt = mvccReadPoint;
} else if (nonce == HConstants.NO_NONCE || rsServices == null
|| rsServices.getNonceManager() == null) {
this.readPt = getReadPoint(isolationLevel);
} else {
this.readPt = rsServices.getNonceManager().getMvccFromOperationContext(nonceGroup, nonce);
}
scannerReadPoints.put(this, this.readPt);
}
initializeScanners(scan, additionalScanners);
}
protected void initializeScanners(Scan scan, List<KeyValueScanner> additionalScanners)
throws IOException {
// Here we separate all scanners into two lists - scanner that provide data required
// by the filter to operate (scanners list) and all others (joinedScanners list).
List<KeyValueScanner> scanners = new ArrayList<>(scan.getFamilyMap().size());
List<KeyValueScanner> joinedScanners = new ArrayList<>(scan.getFamilyMap().size());
// Store all already instantiated scanners for exception handling
List<KeyValueScanner> instantiatedScanners = new ArrayList<>();
// handle additionalScanners
if (additionalScanners != null && !additionalScanners.isEmpty()) {
scanners.addAll(additionalScanners);
instantiatedScanners.addAll(additionalScanners);
}
try {
for (Map.Entry<byte[], NavigableSet<byte[]>> entry : scan.getFamilyMap().entrySet()) {
HStore store = stores.get(entry.getKey());
KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
instantiatedScanners.add(scanner);
if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
|| this.filter.isFamilyEssential(entry.getKey())) {
scanners.add(scanner);
} else {
joinedScanners.add(scanner);
}
}
initializeKVHeap(scanners, joinedScanners, region);
} catch (Throwable t) {
throw handleException(instantiatedScanners, t);
}
}
protected void initializeKVHeap(List<KeyValueScanner> scanners,
List<KeyValueScanner> joinedScanners, HRegion region)
throws IOException {
this.storeHeap = new KeyValueHeap(scanners, comparator);
if (!joinedScanners.isEmpty()) {
this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
}
}
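// A minimal client-side sketch (illustrative, not part of HRegion) of the on-demand column family
// loading that the storeHeap/joinedHeap split above supports. Assuming a filter that implements
// isFamilyEssential, such as SingleColumnValueFilter, only the essential family feeds the
// storeHeap; the remaining families are read lazily through the joinedHeap. Family and qualifier
// names here are hypothetical.
//
//   Scan scan = new Scan();
//   scan.setLoadColumnFamiliesOnDemand(true);
//   scan.setFilter(new SingleColumnValueFilter(Bytes.toBytes("meta"), Bytes.toBytes("flag"),
//       CompareOperator.EQUAL, Bytes.toBytes("y")));
//   // families other than "meta" are fetched via the joinedHeap only for rows that pass the filter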
private IOException handleException(List<KeyValueScanner> instantiatedScanners,
Throwable t) {
// remove the scanner read point before throwing the exception
scannerReadPoints.remove(this);
if (storeHeap != null) {
storeHeap.close();
storeHeap = null;
if (joinedHeap != null) {
joinedHeap.close();
joinedHeap = null;
}
} else {
// close all already instantiated scanners before throwing the exception
for (KeyValueScanner scanner : instantiatedScanners) {
scanner.close();
}
}
return t instanceof IOException ? (IOException) t : new IOException(t);
}
@Override
public long getMaxResultSize() {
return maxResultSize;
}
@Override
public long getMvccReadPoint() {
return this.readPt;
}
@Override
public int getBatch() {
return this.defaultScannerContext.getBatchLimit();
}
/**
* Reset both the filter and the old filter.
*
* @throws IOException in case a filter raises an I/O exception.
*/
protected void resetFilters() throws IOException {
if (filter != null) {
filter.reset();
}
}
@Override
public boolean next(List<Cell> outResults)
throws IOException {
// apply the batching limit by default
return next(outResults, defaultScannerContext);
}
@Override
public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext)
throws IOException {
if (this.filterClosed) {
throw new UnknownScannerException("Scanner was closed (timed out?) " +
"after we renewed it. Could be caused by a very slow scanner " +
"or a lengthy garbage collection");
}
startRegionOperation(Operation.SCAN);
try {
return nextRaw(outResults, scannerContext);
} finally {
closeRegionOperation(Operation.SCAN);
}
}
@Override
public boolean nextRaw(List<Cell> outResults) throws IOException {
// Use the RegionScanner's context by default
return nextRaw(outResults, defaultScannerContext);
}
@Override
public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
throws IOException {
if (storeHeap == null) {
// scanner is closed
throw new UnknownScannerException("Scanner was closed");
}
boolean moreValues = false;
if (outResults.isEmpty()) {
// Usually outResults is empty. This is true when next is called
// to handle scan or get operation.
moreValues = nextInternal(outResults, scannerContext);
} else {
List<Cell> tmpList = new ArrayList<>();
moreValues = nextInternal(tmpList, scannerContext);
outResults.addAll(tmpList);
}
readRequestsCount.increment();
if (metricsRegion != null) {
metricsRegion.updateReadRequestCount();
}
// If the size limit was reached it means a partial Result is being returned. Returning a
// partial Result means that we should not reset the filters; filters should only be reset in
// between rows
if (!scannerContext.mayHaveMoreCellsInRow()) {
resetFilters();
}
if (isFilterDoneInternal()) {
moreValues = false;
}
return moreValues;
}
/**
* @return true if more cells exist after this batch, false if scanner is done
*/
private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
throws IOException {
assert joinedContinuationRow != null;
boolean moreValues = populateResult(results, this.joinedHeap, scannerContext,
joinedContinuationRow);
if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
// We are done with this row, reset the continuation.
joinedContinuationRow = null;
}
// As the data is obtained from two independent heaps, we need to
// ensure that result list is sorted, because Result relies on that.
sort(results, comparator);
return moreValues;
}
/**
* Fetches records with currentRow into the results list, until the next row, the batchLimit
* (if not -1), or the remainingResultSize (if not -1) is reached.
* @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
* @param scannerContext
* @param currentRowCell
* @return state of last call to {@link KeyValueHeap#next()}
*/
private boolean populateResult(List<Cell> results, KeyValueHeap heap,
ScannerContext scannerContext, Cell currentRowCell) throws IOException {
Cell nextKv;
boolean moreCellsInRow = false;
boolean tmpKeepProgress = scannerContext.getKeepProgress();
// Scanning between column families and thus the scope is between cells
LimitScope limitScope = LimitScope.BETWEEN_CELLS;
do {
// We want to maintain any progress that is made towards the limits while scanning across
// different column families. To do this, we toggle the keep progress flag on during calls
// to the StoreScanner to ensure that any progress made thus far is not wiped away.
scannerContext.setKeepProgress(true);
heap.next(results, scannerContext);
scannerContext.setKeepProgress(tmpKeepProgress);
nextKv = heap.peek();
moreCellsInRow = moreCellsInRow(nextKv, currentRowCell);
if (!moreCellsInRow) incrementCountOfRowsScannedMetric(scannerContext);
if (moreCellsInRow && scannerContext.checkBatchLimit(limitScope)) {
return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
} else if (scannerContext.checkSizeLimit(limitScope)) {
ScannerContext.NextState state =
moreCellsInRow ? NextState.SIZE_LIMIT_REACHED_MID_ROW : NextState.SIZE_LIMIT_REACHED;
return scannerContext.setScannerState(state).hasMoreValues();
} else if (scannerContext.checkTimeLimit(limitScope)) {
ScannerContext.NextState state =
moreCellsInRow ? NextState.TIME_LIMIT_REACHED_MID_ROW : NextState.TIME_LIMIT_REACHED;
return scannerContext.setScannerState(state).hasMoreValues();
}
} while (moreCellsInRow);
return nextKv != null;
}
/**
* Based on the nextKv in the heap, and the current row, decide whether or not there are more
* cells to be read in the heap. If the row of the nextKv in the heap matches the current row
* then there are more cells to be read in the row.
* @param nextKv
* @param currentRowCell
* @return true When there are more cells in the row to be read
*/
private boolean moreCellsInRow(final Cell nextKv, Cell currentRowCell) {
return nextKv != null && CellUtil.matchingRows(nextKv, currentRowCell);
}
/*
* @return True if a filter rules that the scanner is done.
*/
@Override
public synchronized boolean isFilterDone() throws IOException {
return isFilterDoneInternal();
}
private boolean isFilterDoneInternal() throws IOException {
return this.filter != null && this.filter.filterAllRemaining();
}
private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
throws IOException {
if (!results.isEmpty()) {
throw new IllegalArgumentException("First parameter should be an empty list");
}
if (scannerContext == null) {
throw new IllegalArgumentException("Scanner context cannot be null");
}
Optional<RpcCall> rpcCall = RpcServer.getCurrentCall();
// Save the initial progress from the Scanner context in these local variables. The progress
// may need to be reset a few times if rows are being filtered out so we save the initial
// progress.
int initialBatchProgress = scannerContext.getBatchProgress();
long initialSizeProgress = scannerContext.getDataSizeProgress();
long initialHeapSizeProgress = scannerContext.getHeapSizeProgress();
// Used to check time limit
LimitScope limitScope = LimitScope.BETWEEN_CELLS;
// The loop here is used only when at some point during the next we determine
// that due to effects of filters or otherwise, we have an empty row in the result.
// Then we loop and try again. Otherwise, we must get out on the first iteration via return,
// "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
// and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow).
while (true) {
// Starting to scan a new row. Reset the scanner progress according to whether or not
// progress should be kept.
if (scannerContext.getKeepProgress()) {
// Progress should be kept. Reset to initial values seen at start of method invocation.
scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
initialHeapSizeProgress);
} else {
scannerContext.clearProgress();
}
if (rpcCall.isPresent()) {
// If a user specifies a too-restrictive or too-slow scanner, the
// client might time out and disconnect while the server side
// is still processing the request. We should abort aggressively
// in that case.
long afterTime = rpcCall.get().disconnectSince();
if (afterTime >= 0) {
throw new CallerDisconnectedException(
"Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
this + " after " + afterTime + " ms, since " +
"caller disconnected");
}
}
// Let's see what we have in the storeHeap.
Cell current = this.storeHeap.peek();
boolean shouldStop = shouldStop(current);
// When hasFilterRow is true it means that all the cells for a particular row must be
// read before a filtering decision can be made. This means that filters for which hasFilterRow
// is true run the risk of encountering out of memory errors in the case that they are applied
// to a table that has very large rows.
boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
// If filter#hasFilterRow is true, partial results are not allowed since allowing them
// would prevent the filters from being evaluated. Thus, if it is true, change the
// scope of any limits that could potentially create partial results to
// LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
if (hasFilterRow) {
if (LOG.isTraceEnabled()) {
LOG.trace("filter#hasFilterRow is true which prevents partial results from being "
+ " formed. Changing scope of limits that may create partials");
}
scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
limitScope = LimitScope.BETWEEN_ROWS;
}
if (scannerContext.checkTimeLimit(LimitScope.BETWEEN_CELLS)) {
if (hasFilterRow) {
throw new IncompatibleFilterException(
"Filter whose hasFilterRow() returns true is incompatible with scans that must " +
" stop mid-row because of a limit. ScannerContext:" + scannerContext);
}
return true;
}
// Check if we were getting data from the joinedHeap and hit the limit.
// If not, then it's main path - getting results from storeHeap.
if (joinedContinuationRow == null) {
// First, check if we are at a stop row. If so, there are no more results.
if (shouldStop) {
if (hasFilterRow) {
filter.filterRowCells(results);
}
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
}
// Check if rowkey filter wants to exclude this row. If so, loop to next.
// Technically, if we hit limits before on this row, we don't need this call.
if (filterRowKey(current)) {
incrementCountOfRowsFilteredMetric(scannerContext);
// early check, see HBASE-16296
if (isFilterDoneInternal()) {
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
}
// Typically the count of rows scanned is incremented inside #populateResult. However,
// here we are filtering a row based purely on its row key, preventing us from calling
// #populateResult. Thus, perform the necessary increment here to rows scanned metric
incrementCountOfRowsScannedMetric(scannerContext);
boolean moreRows = nextRow(scannerContext, current);
if (!moreRows) {
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
}
results.clear();
// Read nothing as the rowkey was filtered, but still need to check time limit
if (scannerContext.checkTimeLimit(limitScope)) {
return true;
}
continue;
}
// Ok, we are good, let's try to get some results from the main heap.
populateResult(results, this.storeHeap, scannerContext, current);
if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
if (hasFilterRow) {
throw new IncompatibleFilterException(
"Filter whose hasFilterRow() returns true is incompatible with scans that must "
+ " stop mid-row because of a limit. ScannerContext:" + scannerContext);
}
return true;
}
Cell nextKv = this.storeHeap.peek();
shouldStop = shouldStop(nextKv);
// save that the row was empty before filters applied to it.
final boolean isEmptyRow = results.isEmpty();
// We have the part of the row necessary for filtering (all of it, usually).
// First filter with the filterRow(List).
FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
if (hasFilterRow) {
ret = filter.filterRowCellsWithRet(results);
// We don't know how the results have changed after being filtered. Must set progress
// according to contents of results now.
if (scannerContext.getKeepProgress()) {
scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
initialHeapSizeProgress);
} else {
scannerContext.clearProgress();
}
scannerContext.incrementBatchProgress(results.size());
for (Cell cell : results) {
scannerContext.incrementSizeProgress(PrivateCellUtil.estimatedSerializedSizeOf(cell),
cell.heapSize());
}
}
if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) {
incrementCountOfRowsFilteredMetric(scannerContext);
results.clear();
boolean moreRows = nextRow(scannerContext, current);
if (!moreRows) {
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
}
// This row was totally filtered out, if this is NOT the last row,
// we should continue on. Otherwise, nothing else to do.
if (!shouldStop) {
// Read nothing as the cells were filtered, but still need to check the time limit
if (scannerContext.checkTimeLimit(limitScope)) {
return true;
}
continue;
}
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
}
// Ok, we are done with storeHeap for this row.
// Now we may need to fetch additional, non-essential data into row.
// These values are not needed for filter to work, so we postpone their
// fetch to (possibly) reduce amount of data loads from disk.
if (this.joinedHeap != null) {
boolean mayHaveData = joinedHeapMayHaveData(current);
if (mayHaveData) {
joinedContinuationRow = current;
populateFromJoinedHeap(results, scannerContext);
if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
return true;
}
}
}
} else {
// Populating from the joined heap was stopped by limits, populate some more.
populateFromJoinedHeap(results, scannerContext);
if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
return true;
}
}
// We may have just called populateFromJoinedHeap and hit the limits. If that is
// the case, we need to call it again on the next next() invocation.
if (joinedContinuationRow != null) {
return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
}
// Finally, we are done with both joinedHeap and storeHeap.
// Double check to prevent empty rows from appearing in result. It could be
// the case when SingleColumnValueExcludeFilter is used.
if (results.isEmpty()) {
incrementCountOfRowsFilteredMetric(scannerContext);
boolean moreRows = nextRow(scannerContext, current);
if (!moreRows) {
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
}
if (!shouldStop) continue;
}
if (shouldStop) {
return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
} else {
return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
}
}
}
protected void incrementCountOfRowsFilteredMetric(ScannerContext scannerContext) {
filteredReadRequestsCount.increment();
if (metricsRegion != null) {
metricsRegion.updateFilteredRecords();
}
if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
scannerContext.getMetrics().countOfRowsFiltered.incrementAndGet();
}
protected void incrementCountOfRowsScannedMetric(ScannerContext scannerContext) {
if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
scannerContext.getMetrics().countOfRowsScanned.incrementAndGet();
}
/**
* @param currentRowCell
* @return true when the joined heap may have data for the current row
* @throws IOException
*/
private boolean joinedHeapMayHaveData(Cell currentRowCell)
throws IOException {
Cell nextJoinedKv = joinedHeap.peek();
boolean matchCurrentRow =
nextJoinedKv != null && CellUtil.matchingRows(nextJoinedKv, currentRowCell);
boolean matchAfterSeek = false;
// If the next value in the joined heap does not match the current row, try to seek to the
// correct row
if (!matchCurrentRow) {
Cell firstOnCurrentRow = PrivateCellUtil.createFirstOnRow(currentRowCell);
boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
matchAfterSeek =
seekSuccessful && joinedHeap.peek() != null
&& CellUtil.matchingRows(joinedHeap.peek(), currentRowCell);
}
return matchCurrentRow || matchAfterSeek;
}
/**
* This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
* both filterRow & filterRow({@code List<KeyValue> kvs}) functions. With 0.94 code or older,
* it may not implement hasFilterRow as HBASE-6429 expects because 0.94 hasFilterRow() only
* returns true when filterRow({@code List<KeyValue> kvs}) is overridden, not the filterRow().
* Therefore, the filterRow() will be skipped.
*/
private boolean filterRow() throws IOException {
// when hasFilterRow returns true, filter.filterRow() will be called automatically inside
// filterRowCells(List<Cell> kvs) so we skip that scenario here.
return filter != null && (!filter.hasFilterRow())
&& filter.filterRow();
}
private boolean filterRowKey(Cell current) throws IOException {
return filter != null && filter.filterRowKey(current);
}
protected boolean nextRow(ScannerContext scannerContext, Cell curRowCell) throws IOException {
assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
Cell next;
while ((next = this.storeHeap.peek()) != null &&
CellUtil.matchingRows(next, curRowCell)) {
this.storeHeap.next(MOCKED_LIST);
}
resetFilters();
// Calling the hook in CP which allows it to do a fast forward
return this.region.getCoprocessorHost() == null
|| this.region.getCoprocessorHost()
.postScannerFilterRow(this, curRowCell);
}
protected boolean shouldStop(Cell currentRowCell) {
if (currentRowCell == null) {
return true;
}
if (stopRow == null || Bytes.equals(stopRow, HConstants.EMPTY_END_ROW)) {
return false;
}
int c = comparator.compareRows(currentRowCell, stopRow, 0, stopRow.length);
return c > 0 || (c == 0 && !includeStopRow);
}
@Override
public synchronized void close() {
if (storeHeap != null) {
storeHeap.close();
storeHeap = null;
}
if (joinedHeap != null) {
joinedHeap.close();
joinedHeap = null;
}
// no need to synchronize here.
scannerReadPoints.remove(this);
this.filterClosed = true;
}
KeyValueHeap getStoreHeapForTesting() {
return storeHeap;
}
@Override
public synchronized boolean reseek(byte[] row) throws IOException {
if (row == null) {
throw new IllegalArgumentException("Row cannot be null.");
}
boolean result = false;
startRegionOperation();
Cell kv = PrivateCellUtil.createFirstOnRow(row, 0, (short) row.length);
try {
// use request seek to make use of the lazy seek option. See HBASE-5520
result = this.storeHeap.requestSeek(kv, true, true);
if (this.joinedHeap != null) {
result = this.joinedHeap.requestSeek(kv, true, true) || result;
}
} finally {
closeRegionOperation();
}
return result;
}
@Override
public void shipped() throws IOException {
if (storeHeap != null) {
storeHeap.shipped();
}
if (joinedHeap != null) {
joinedHeap.shipped();
}
}
@Override
public void run() throws IOException {
// This is the RPC callback method executed. We do the close of the scanner in this
// callback.
this.close();
}
}
// Utility methods
/**
* A utility method to create new instances of HRegion based on the
* {@link HConstants#REGION_IMPL} configuration property.
* @param tableDir qualified path of directory where region should be located,
* usually the table directory.
* @param wal The WAL is the outbound log for any updates to the HRegion
* The wal file is a logfile from the previous execution that's
* custom-computed for this HRegion. The HRegionServer computes and sorts the
* appropriate wal info for this HRegion. If there is a previous file
* (implying that the HRegion has been written-to before), then read it from
* the supplied path.
* @param fs is the filesystem.
* @param conf is global configuration settings.
* @param regionInfo RegionInfo that describes the region
* @param htd the table descriptor
* @return the new instance
*/
public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
Configuration conf, RegionInfo regionInfo, final TableDescriptor htd,
RegionServerServices rsServices) {
try {
@SuppressWarnings("unchecked")
Class<? extends HRegion> regionClass =
(Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
Constructor<? extends HRegion> c =
regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
Configuration.class, RegionInfo.class, TableDescriptor.class,
RegionServerServices.class);
return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
} catch (Throwable e) {
// todo: what should I throw here?
throw new IllegalStateException("Could not instantiate a region instance.", e);
}
}
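// Illustrative sketch (not part of HRegion): plugging in a custom region implementation through
// the HConstants.REGION_IMPL property that newHRegion reads above. MyRegion is hypothetical; it
// would need the same seven-argument constructor reflected on above.
//
//   Configuration conf = HBaseConfiguration.create();
//   conf.setClass(HConstants.REGION_IMPL, MyRegion.class, HRegion.class);
//   // any subsequent newHRegion/createHRegion/openHRegion call on this conf builds MyRegion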
/**
* Convenience method creating new HRegions. Used by createTable.
*
* @param info Info for region to create.
* @param rootDir Root directory for HBase instance
* @param wal shared WAL
* @param initialize - true to initialize the region
* @return new HRegion
*/
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal,
final boolean initialize) throws IOException {
LOG.info("creating " + info + ", tableDescriptor=" +
(hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir);
createRegionDir(conf, info, rootDir);
FileSystem fs = rootDir.getFileSystem(conf);
Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable());
HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, null);
if (initialize) {
region.initialize(null);
}
return region;
}
/**
* Create a region under the given table directory.
*/
public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs,
Path tableDir, TableDescriptor tableDesc) throws IOException {
LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc,
tableDir);
HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo);
HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null);
return region;
}
/**
* Create the region directory in the filesystem.
*/
public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri,
Path rootDir)
throws IOException {
FileSystem fs = rootDir.getFileSystem(configuration);
Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable());
// If directory already exists, will log warning and keep going. Will try to create
// .regioninfo. If one exists, will overwrite.
return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri);
}
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
final Configuration conf,
final TableDescriptor hTableDescriptor,
final WAL wal)
throws IOException {
return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
}
/**
* Open a Region.
* @param info Info for region to be opened.
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @return new HRegion
*
* @throws IOException
*/
public static HRegion openHRegion(final RegionInfo info,
final TableDescriptor htd, final WAL wal,
final Configuration conf)
throws IOException {
return openHRegion(info, htd, wal, conf, null, null);
}
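// A minimal create-then-open sketch (illustrative, not part of HRegion), using the static helpers
// above. The table name, column family, rootDir, conf and wal are assumed/hypothetical, and wal
// may be null for throwaway or test regions.
//
//   TableName tn = TableName.valueOf("demo");
//   TableDescriptor htd = TableDescriptorBuilder.newBuilder(tn)
//       .setColumnFamily(ColumnFamilyDescriptorBuilder.of("cf")).build();
//   RegionInfo ri = RegionInfoBuilder.newBuilder(tn).build();
//   HRegion created = HRegion.createHRegion(ri, rootDir, conf, htd, wal);
//   // a created region would normally be closed before being reopened
//   HRegion opened = HRegion.openHRegion(ri, htd, wal, conf);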
/**
* Open a Region.
* @param info Info for region to be opened
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @param conf The Configuration object to use.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*
* @throws IOException
*/
public static HRegion openHRegion(final RegionInfo info,
final TableDescriptor htd, final WAL wal, final Configuration conf,
final RegionServerServices rsServices,
final CancelableProgressable reporter)
throws IOException {
return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
}
/**
* Open a Region.
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @param conf The Configuration object to use.
* @return new HRegion
* @throws IOException
*/
public static HRegion openHRegion(Path rootDir, final RegionInfo info,
final TableDescriptor htd, final WAL wal, final Configuration conf)
throws IOException {
return openHRegion(rootDir, info, htd, wal, conf, null, null);
}
/**
* Open a Region.
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @param conf The Configuration object to use.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
* @throws IOException
*/
public static HRegion openHRegion(final Path rootDir, final RegionInfo info,
final TableDescriptor htd, final WAL wal, final Configuration conf,
final RegionServerServices rsServices,
final CancelableProgressable reporter)
throws IOException {
FileSystem fs = null;
if (rsServices != null) {
fs = rsServices.getFileSystem();
}
if (fs == null) {
fs = rootDir.getFileSystem(conf);
}
return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
}
/**
* Open a Region.
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @return new HRegion
*/
public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal)
throws IOException {
return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
}
/**
* Open a Region.
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal,
final RegionServerServices rsServices, final CancelableProgressable reporter)
throws IOException {
Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable());
return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter);
}
/**
* Open a Region.
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call
* WAL#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the wal id is properly kept
* up. HRegionStore does this every time it opens a new region.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs,
final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal,
final RegionServerServices rsServices, final CancelableProgressable reporter)
throws IOException {
Objects.requireNonNull(info, "RegionInfo cannot be null");
LOG.debug("Opening region: {}", info);
HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
return r.openHRegion(reporter);
}
@VisibleForTesting
public NavigableMap<byte[], Integer> getReplicationScope() {
return this.replicationScope;
}
/**
* Useful when reopening a closed region (normally for unit tests)
* @param other original object
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
throws IOException {
HRegionFileSystem regionFs = other.getRegionFileSystem();
HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null);
return r.openHRegion(reporter);
}
public static Region openHRegion(final Region other, final CancelableProgressable reporter)
throws IOException {
return openHRegion((HRegion)other, reporter);
}
/**
* Open HRegion.
* Calls initialize and sets sequenceId.
* @return Returns this
*/
protected HRegion openHRegion(final CancelableProgressable reporter)
throws IOException {
try {
// Refuse to open the region if we are missing local compression support
TableDescriptorChecker.checkCompression(htableDescriptor);
// Refuse to open the region if encryption configuration is incorrect or
// codec support is missing
TableDescriptorChecker.checkEncryption(conf, htableDescriptor);
// Refuse to open the region if a required class cannot be loaded
TableDescriptorChecker.checkClassLoading(conf, htableDescriptor);
this.openSeqNum = initialize(reporter);
this.mvcc.advanceTo(openSeqNum);
// The openSeqNum must be increased every time when a region is assigned, as we rely on it to
// determine whether a region has been successfully reopened. So here we always write open
// marker, even if the table is read only.
if (wal != null && getRegionServerServices() != null &&
RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
writeRegionOpenMarker(wal, openSeqNum);
}
} catch (Throwable t) {
// A wrong coprocessor path can cause the region open to fail. At that point
// MetricsRegionWrapperImpl has already been initialized and not closed,
// so close the region when the open fails
try {
// It is not required to write sequence id file when region open is failed.
// Passing true to skip the sequence id file write.
this.close(true);
} catch (Throwable e) {
LOG.warn("Open region: {} failed. Try close region but got exception ", this.getRegionInfo(),
e);
}
throw t;
}
return this;
}
/**
* Open a Region on a read-only file-system (like hdfs snapshots)
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param info Info for region to be opened.
* @param htd the table descriptor
* @return new HRegion
*/
public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs,
final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException {
if (info == null) {
throw new NullPointerException("Passed region info is null");
}
if (LOG.isDebugEnabled()) {
LOG.debug("Opening region (readOnly filesystem): " + info);
}
if (info.getReplicaId() <= 0) {
info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build();
}
HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null);
r.writestate.setReadOnly(true);
return r.openHRegion(null);
}
public static void warmupHRegion(final RegionInfo info,
final TableDescriptor htd, final WAL wal, final Configuration conf,
final RegionServerServices rsServices,
final CancelableProgressable reporter)
throws IOException {
if (info == null) throw new NullPointerException("Passed region info is null");
if (LOG.isDebugEnabled()) {
LOG.debug("HRegion.Warming up region: " + info);
}
Path rootDir = CommonFSUtils.getRootDir(conf);
Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable());
FileSystem fs = null;
if (rsServices != null) {
fs = rsServices.getFileSystem();
}
if (fs == null) {
fs = rootDir.getFileSystem(conf);
}
HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null);
r.initializeWarmup(reporter);
}
/**
* Computes the Path of the HRegion
*
* @param tabledir qualified path for table
* @param name ENCODED region name
* @return Path of HRegion directory
* @deprecated For tests only; to be removed.
*/
@Deprecated
public static Path getRegionDir(final Path tabledir, final String name) {
return new Path(tabledir, name);
}
/**
* Determines if the specified row is within the row range specified by the
* specified RegionInfo
*
* @param info RegionInfo that specifies the row range
* @param row row to be checked
* @return true if the row is within the range specified by the RegionInfo
*/
public static boolean rowIsInRange(RegionInfo info, final byte [] row) {
return ((info.getStartKey().length == 0) ||
(Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
((info.getEndKey().length == 0) ||
(Bytes.compareTo(info.getEndKey(), row) > 0));
}
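// Worked example (illustrative): with a region whose start key is "b" (inclusive) and end key is
// "d" (exclusive), rowIsInRange behaves as follows. The table and keys shown are hypothetical.
//
//   RegionInfo ri = RegionInfoBuilder.newBuilder(TableName.valueOf("demo"))
//       .setStartKey(Bytes.toBytes("b")).setEndKey(Bytes.toBytes("d")).build();
//   HRegion.rowIsInRange(ri, Bytes.toBytes("b"));  // true  (start key is inclusive)
//   HRegion.rowIsInRange(ri, Bytes.toBytes("c"));  // true
//   HRegion.rowIsInRange(ri, Bytes.toBytes("d"));  // false (end key is exclusive)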
public static boolean rowIsInRange(RegionInfo info, final byte [] row, final int offset,
final short length) {
return ((info.getStartKey().length == 0) ||
(Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
row, offset, length) <= 0)) &&
((info.getEndKey().length == 0) ||
(Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
}
@Override
public Result get(final Get get) throws IOException {
prepareGet(get);
List<Cell> results = get(get, true);
boolean stale = this.getRegionInfo().getReplicaId() != 0;
return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
}
void prepareGet(final Get get) throws IOException {
checkRow(get.getRow(), "Get");
// Verify families are all valid
if (get.hasFamilies()) {
for (byte[] family : get.familySet()) {
checkFamily(family);
}
} else { // Adding all families to scanner
for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
get.addFamily(family);
}
}
}
@Override
public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public List<Cell> get(Get get, boolean withCoprocessor, long nonceGroup, long nonce)
throws IOException {
List<Cell> results = new ArrayList<>();
long before = EnvironmentEdgeManager.currentTime();
// pre-get CP hook
if (withCoprocessor && (coprocessorHost != null)) {
if (coprocessorHost.preGet(get, results)) {
metricsUpdateForGet(results, before);
return results;
}
}
Scan scan = new Scan(get);
if (scan.getLoadColumnFamiliesOnDemandValue() == null) {
scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault());
}
RegionScanner scanner = null;
try {
scanner = getScanner(scan, null, nonceGroup, nonce);
scanner.next(results);
} finally {
if (scanner != null)
scanner.close();
}
// post-get CP hook
if (withCoprocessor && (coprocessorHost != null)) {
coprocessorHost.postGet(get, results);
}
metricsUpdateForGet(results, before);
return results;
}
void metricsUpdateForGet(List<Cell> results, long before) {
if (this.metricsRegion != null) {
this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before);
}
if (rsServices != null && this.rsServices.getMetrics() != null) {
rsServices.getMetrics().updateReadQueryMeter(getTableDescriptor().getTableName(), 1);
}
}
@Override
public void mutateRow(RowMutations rm) throws IOException {
// Don't need nonces here - RowMutations only supports puts and deletes
final List<Mutation> m = rm.getMutations();
batchMutate(m.toArray(new Mutation[m.size()]), true, HConstants.NO_NONCE,
HConstants.NO_NONCE);
}
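// A short usage sketch (illustrative, not part of HRegion): mutateRow applies several mutations to
// one row atomically; only Put and Delete are supported here. Row, family and qualifier names are
// hypothetical.
//
//   byte[] row = Bytes.toBytes("r1");
//   Put put = new Put(row).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("a"), Bytes.toBytes("1"));
//   Delete del = new Delete(row).addColumns(Bytes.toBytes("cf"), Bytes.toBytes("b"));
//   RowMutations rm = new RowMutations(row);
//   rm.add(put);
//   rm.add(del);
//   region.mutateRow(rm);  // both changes become visible together or not at all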
/**
* Perform atomic (all or none) mutations within the region.
* @param mutations The list of mutations to perform.
* mutations can contain operations for multiple rows.
* Caller has to ensure that all rows are contained in this region.
* @param rowsToLock Rows to lock. If multiple rows are locked, care should be taken that
* rowsToLock is sorted in order to avoid deadlocks.
* @param nonceGroup Optional nonce group of the operation (client Id)
* @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
* @throws IOException
* @throws IOException
*/
@Override
public void mutateRowsWithLocks(Collection<Mutation> mutations,
Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]),
true, nonceGroup, nonce) {
@Override
public MiniBatchOperationInProgress<Mutation> lockRowsAndBuildMiniBatch(
List<RowLock> acquiredRowLocks) throws IOException {
RowLock prevRowLock = null;
for (byte[] row : rowsToLock) {
try {
RowLock rowLock = region.getRowLockInternal(row, false, prevRowLock); // write lock
if (rowLock != prevRowLock) {
acquiredRowLocks.add(rowLock);
prevRowLock = rowLock;
}
} catch (IOException ioe) {
LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this,
ioe);
throw ioe;
}
}
return createMiniBatch(size(), size());
}
});
}
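// Illustrative multi-row sketch (not part of HRegion): mutateRowsWithLocks takes the mutations
// plus the full set of rows to lock, and the rows should be supplied in sorted order to avoid
// deadlocks with concurrent callers. Row and family names are hypothetical.
//
//   byte[] r1 = Bytes.toBytes("r1");
//   byte[] r2 = Bytes.toBytes("r2");
//   List<Mutation> muts = Arrays.asList(
//       new Put(r1).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("x")),
//       new Put(r2).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("y")));
//   SortedSet<byte[]> rows = new TreeSet<>(Bytes.BYTES_COMPARATOR);
//   rows.add(r1);
//   rows.add(r2);
//   region.mutateRowsWithLocks(muts, rows, HConstants.NO_NONCE, HConstants.NO_NONCE);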
/**
* @return statistics about the current load of the region
*/
public ClientProtos.RegionLoadStats getLoadStatistics() {
if (!regionStatsEnabled) {
return null;
}
ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
stats.setMemStoreLoad((int) (Math.min(100,
(this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize)));
if (rsServices.getHeapMemoryManager() != null) {
// the HeapMemoryManager uses -0.0 to signal a problem asking the JVM,
// so we could just do the calculation below and we'll get a 0.
// treating it as a special case analogous to no HMM instead so that it can be
// programmatically treated differently from using <1% of heap.
final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent();
if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) {
stats.setHeapOccupancy((int)(occupancy * 100));
}
}
stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 ? 100
: rsServices.getCompactionPressure() * 100));
return stats.build();
}
@Override
public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
@Override
public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
throws IOException {
processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
}
@Override
public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
long nonceGroup, long nonce) throws IOException {
for (byte[] row : processor.getRowsToLock()) {
checkRow(row, "processRowsWithLocks");
}
if (!processor.readOnly()) {
checkReadOnly();
}
checkResources();
startRegionOperation();
WALEdit walEdit = new WALEdit();
// STEP 1. Run pre-process hook
preProcess(processor, walEdit);
// Short circuit the read only case
if (processor.readOnly()) {
try {
long now = EnvironmentEdgeManager.currentTime();
doProcessRowWithTimeout(processor, now, this, null, null, timeout);
processor.postProcess(this, walEdit, true);
} finally {
closeRegionOperation();
}
return;
}
boolean locked = false;
List<RowLock> acquiredRowLocks = null;
List<Mutation> mutations = new ArrayList<>();
Collection<byte[]> rowsToLock = processor.getRowsToLock();
// This is assigned by mvcc either explicitly below or in the guts of the WAL append
// when it assigns the edit a sequence id (a.k.a. the mvcc write number).
WriteEntry writeEntry = null;
MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing();
try {
boolean success = false;
try {
// STEP 2. Acquire the row lock(s)
acquiredRowLocks = new ArrayList<>(rowsToLock.size());
RowLock prevRowLock = null;
for (byte[] row : rowsToLock) {
// Attempt to lock all involved rows, throw if any lock times out
// use a writer lock for mixed reads and writes
RowLock rowLock = getRowLockInternal(row, false, prevRowLock);
if (rowLock != prevRowLock) {
acquiredRowLocks.add(rowLock);
prevRowLock = rowLock;
}
}
// STEP 3. Region lock
lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 1 : acquiredRowLocks.size());
locked = true;
long now = EnvironmentEdgeManager.currentTime();
// STEP 4. Let the processor scan the rows, generate mutations and add waledits
doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout);
if (!mutations.isEmpty()) {
writeRequestsCount.add(mutations.size());
// STEP 5. Call the preBatchMutate hook
processor.preBatchMutate(this, walEdit);
// STEP 6. Append and sync if walEdit has data to write out.
if (!walEdit.isEmpty()) {
writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()),
processor.getClusterIds(), now, nonceGroup, nonce);
} else {
// We are here if WAL is being skipped.
writeEntry = this.mvcc.begin();
}
// STEP 7. Apply to memstore
long sequenceId = writeEntry.getWriteNumber();
for (Mutation m : mutations) {
// Handle any tag based cell features.
// TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before
// so tags go into WAL?
rewriteCellTags(m.getFamilyCellMap(), m);
for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
Cell cell = cellScanner.current();
if (walEdit.isEmpty()) {
// If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id.
// If no WAL, need to stamp it here.
PrivateCellUtil.setSequenceId(cell, sequenceId);
}
applyToMemStore(getStore(cell), cell, memstoreAccounting);
}
}
// STEP 8. call postBatchMutate hook
processor.postBatchMutate(this);
// STEP 9. Complete mvcc.
mvcc.completeAndWait(writeEntry);
writeEntry = null;
// STEP 10. Release region lock
if (locked) {
this.updatesLock.readLock().unlock();
locked = false;
}
// STEP 11. Release row lock(s)
releaseRowLocks(acquiredRowLocks);
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.
getTableName(), mutations.size());
}
}
success = true;
} finally {
// Call complete rather than completeAndWait because we probably had error if walKey != null
if (writeEntry != null) mvcc.complete(writeEntry);
if (locked) {
this.updatesLock.readLock().unlock();
}
// release locks if some were acquired but another timed out
releaseRowLocks(acquiredRowLocks);
}
// 12. Run post-process hook
processor.postProcess(this, walEdit, success);
} finally {
closeRegionOperation();
if (!mutations.isEmpty()) {
this.incMemStoreSize(memstoreAccounting.getMemStoreSize());
requestFlushIfNeeded();
}
}
}
private void preProcess(final RowProcessor<?,?> processor, final WALEdit walEdit)
throws IOException {
try {
processor.preProcess(this, walEdit);
} catch (IOException e) {
closeRegionOperation();
throw e;
}
}
private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
final long now,
final HRegion region,
final List<Mutation> mutations,
final WALEdit walEdit,
final long timeout) throws IOException {
// Short circuit the no time bound case.
if (timeout < 0) {
try {
processor.process(now, region, mutations, walEdit);
} catch (IOException e) {
String row = processor.getRowsToLock().isEmpty() ? "" :
" on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
LOG.warn("RowProcessor: {}, in region {}, throws Exception {}",
processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e);
throw e;
}
return;
}
// Case with time bound
FutureTask<Void> task = new FutureTask<>(new Callable<Void>() {
@Override
public Void call() throws IOException {
try {
processor.process(now, region, mutations, walEdit);
return null;
} catch (IOException e) {
String row = processor.getRowsToLock().isEmpty() ? "" :
" on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
LOG.warn("RowProcessor: {}, in region {}, throws Exception {}",
processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e);
throw e;
}
}
});
rowProcessorExecutor.execute(task);
try {
task.get(timeout, TimeUnit.MILLISECONDS);
} catch (TimeoutException te) {
String row = processor.getRowsToLock().isEmpty() ? "" :
" on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
LOG.error("RowProcessor timeout: {} ms, in region {}, {}", timeout,
getRegionInfo().getRegionNameAsString(), row);
throw new IOException(te);
} catch (Exception e) {
throw new IOException(e);
}
}
@Override
public Result append(Append append) throws IOException {
return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public Result append(Append mutation, long nonceGroup, long nonce) throws IOException {
return doDelta(Operation.APPEND, mutation, nonceGroup, nonce, mutation.isReturnResults());
}
@Override
public Result increment(Increment increment) throws IOException {
return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public Result increment(Increment mutation, long nonceGroup, long nonce) throws IOException {
return doDelta(Operation.INCREMENT, mutation, nonceGroup, nonce, mutation.isReturnResults());
}
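// A brief usage sketch (illustrative): both append and increment funnel into doDelta below, which
// reads the current cell under the row lock and applies the delta. Row, family and qualifier
// names are hypothetical.
//
//   byte[] row = Bytes.toBytes("counter-row");
//   Increment inc = new Increment(row).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("hits"), 1L);
//   Result r = region.increment(inc);          // returns the post-increment value
//   Append app = new Append(row).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("log"),
//       Bytes.toBytes(",event"));
//   region.append(app);                        // appends bytes to the existing cell value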
/**
* Add "deltas" to Cells. Deltas are increments or appends. Switch on op .
*
* If increment, add deltas to current values or if an append, then
* append the deltas to the current Cell values.
*
* Append and Increment code paths are mostly the same. They differ in just a few places.
* This method does the code path for increment and append and then in key spots, switches
* on the passed in op to do increment or append specific paths.
*/
private Result doDelta(Operation op, Mutation mutation, long nonceGroup, long nonce,
boolean returnResults) throws IOException {
checkReadOnly();
checkResources();
checkRow(mutation.getRow(), op.toString());
checkFamilies(mutation.getFamilyCellMap().keySet());
this.writeRequestsCount.increment();
WriteEntry writeEntry = null;
startRegionOperation(op);
List<Cell> results = returnResults ? new ArrayList<>(mutation.size()) : null;
RowLock rowLock = null;
MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing();
try {
rowLock = getRowLockInternal(mutation.getRow(), false, null);
lock(this.updatesLock.readLock());
try {
Result cpResult = doCoprocessorPreCall(op, mutation);
if (cpResult != null) {
// Metrics updated below in the finally block.
return returnResults? cpResult: null;
}
Durability effectiveDurability = getEffectiveDurability(mutation.getDurability());
Map<HStore, List<Cell>> forMemStore = new HashMap<>(mutation.getFamilyCellMap().size());
// Reckon Cells to apply to WAL -- in returned walEdit -- and what to add to memstore and
// what to return back to the client (in 'forMemStore' and 'results' respectively).
WALEdit walEdit = reckonDeltas(op, mutation, effectiveDurability, forMemStore, results);
// Actually write to the WAL now if there is a walEdit to apply.
if (walEdit != null && !walEdit.isEmpty()) {
writeEntry = doWALAppend(walEdit, effectiveDurability, nonceGroup, nonce);
} else {
// If walEdits is empty, it means we skipped the WAL; update LongAdders and start an mvcc
// transaction.
recordMutationWithoutWal(mutation.getFamilyCellMap());
writeEntry = mvcc.begin();
updateSequenceId(forMemStore.values(), writeEntry.getWriteNumber());
}
// Now write to MemStore. Do it a column family at a time.
for (Map.Entry<HStore, List<Cell>> e : forMemStore.entrySet()) {
applyToMemStore(e.getKey(), e.getValue(), true, memstoreAccounting);
}
mvcc.completeAndWait(writeEntry);
if (rsServices != null && rsServices.getNonceManager() != null) {
rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce,
writeEntry.getWriteNumber());
}
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.
getTableName());
}
writeEntry = null;
} finally {
this.updatesLock.readLock().unlock();
}
// If results is null, then client asked that we not return the calculated results.
return results != null && returnResults? Result.create(results): Result.EMPTY_RESULT;
} finally {
// Call complete always, even on success. doDelta is doing a Get READ_UNCOMMITTED when it goes
// to get the current value under an exclusive lock, so there is no need to wait before returning
// to the client. This means the only way to read-your-own-increment or append is to come in
// with a 0 increment.
if (writeEntry != null) mvcc.complete(writeEntry);
if (rowLock != null) {
rowLock.release();
}
// Request a cache flush if over the limit. Do it outside update lock.
incMemStoreSize(memstoreAccounting.getMemStoreSize());
requestFlushIfNeeded();
closeRegionOperation(op);
if (this.metricsRegion != null) {
switch (op) {
case INCREMENT:
this.metricsRegion.updateIncrement();
break;
case APPEND:
this.metricsRegion.updateAppend();
break;
default:
break;
}
}
}
}
private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, long nonceGroup,
long nonce)
throws IOException {
return doWALAppend(walEdit, durability, WALKey.EMPTY_UUIDS, System.currentTimeMillis(),
nonceGroup, nonce);
}
private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
long now, long nonceGroup, long nonce) throws IOException {
return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce,
SequenceId.NO_SEQUENCE_ID);
}
/**
* @return writeEntry associated with this append
*/
private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException {
Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(),
"WALEdit is null or empty!");
Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID,
"Invalid replay sequence Id for replay WALEdit!");
// Using default cluster id, as this can only happen in the originating cluster.
// A slave cluster receives the final value (not the delta) as a Put. We use HLogKey
// here instead of WALKeyImpl directly to support legacy coprocessors.
WALKeyImpl walKey = walEdit.isReplay()?
new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds,
nonceGroup, nonce, mvcc) :
new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds,
nonceGroup, nonce, mvcc, this.getReplicationScope());
if (walEdit.isReplay()) {
walKey.setOrigLogSeqNum(origLogSeqNum);
}
//don't call the coproc hook for writes to the WAL caused by
//system lifecycle events like flushes or compactions
if (this.coprocessorHost != null && !walEdit.isMetaEdit()) {
this.coprocessorHost.preWALAppend(walKey, walEdit);
}
WriteEntry writeEntry = null;
try {
long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit);
// Call sync on our edit.
if (txid != 0) {
sync(txid, durability);
}
writeEntry = walKey.getWriteEntry();
} catch (IOException ioe) {
if (walKey != null && walKey.getWriteEntry() != null) {
mvcc.complete(walKey.getWriteEntry());
}
throw ioe;
}
return writeEntry;
}
/**
* Do coprocessor pre-increment or pre-append call.
* @return Result returned out of the coprocessor, which means bypass all further processing and
* return the proffered Result instead, or null which means proceed.
*/
private Result doCoprocessorPreCall(final Operation op, final Mutation mutation)
throws IOException {
Result result = null;
if (this.coprocessorHost != null) {
switch(op) {
case INCREMENT:
result = this.coprocessorHost.preIncrementAfterRowLock((Increment)mutation);
break;
case APPEND:
result = this.coprocessorHost.preAppendAfterRowLock((Append)mutation);
break;
default: throw new UnsupportedOperationException(op.toString());
}
}
return result;
}
/**
* Reckon the Cells to apply to WAL, memstore, and to return to the Client; these Sets are not
* always the same, depending on whether we are writing to the WAL.
*
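* For example, with SKIP_WAL durability the returned WALEdit is null while forMemStore is
* still filled in; for the synchronous durabilities the same Cells go to both.
*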
* @param results Fill in here what goes back to the Client if it is non-null (if null, client
* doesn't want results).
* @param forMemStore Fill in here what to apply to the MemStore (by Store).
* @return A WALEdit to apply to WAL or null if we are to skip the WAL.
*/
private WALEdit reckonDeltas(Operation op, Mutation mutation, Durability effectiveDurability,
Map<HStore, List<Cell>> forMemStore, List<Cell> results) throws IOException {
WALEdit walEdit = null;
long now = EnvironmentEdgeManager.currentTime();
final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL;
// Process a Store/family at a time.
for (Map.Entry<byte[], List<Cell>> entry: mutation.getFamilyCellMap().entrySet()) {
final byte[] columnFamilyName = entry.getKey();
List<Cell> deltas = entry.getValue();
// Reckon for the Store what to apply to WAL and MemStore.
List<Cell> toApply = reckonDeltasByStore(stores.get(columnFamilyName), op, mutation,
effectiveDurability, now, deltas, results);
if (!toApply.isEmpty()) {
for (Cell cell : toApply) {
HStore store = getStore(cell);
if (store == null) {
checkFamily(CellUtil.cloneFamily(cell));
} else {
forMemStore.computeIfAbsent(store, key -> new ArrayList<>()).add(cell);
}
}
if (writeToWAL) {
if (walEdit == null) {
walEdit = new WALEdit();
}
walEdit.getCells().addAll(toApply);
}
}
}
return walEdit;
}
/**
* Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed
* column family/Store.
*
* Does a Get of the current value and then adds the passed-in deltas for this Store,
* returning the result.
*
* @param op Whether Increment or Append
* @param mutation The encompassing Mutation object
* @param deltas Changes to apply to this Store; either increment amount or data to append
* @param results In here we accumulate all the Cells we are to return to the client. If null,
* client doesn't want results returned.
* @return Resulting Cells after deltas have been applied to current
* values. Side effect is our filling out of the results List.
*/
private List<Cell> reckonDeltasByStore(HStore store, Operation op, Mutation mutation,
Durability effectiveDurability, long now, List<Cell> deltas, List<Cell> results)
throws IOException {
byte[] columnFamily = store.getColumnFamilyDescriptor().getName();
List<Pair<Cell, Cell>> cellPairs = new ArrayList<>(deltas.size());
// Get previous values for all columns in this family.
TimeRange tr = null;
switch (op) {
case INCREMENT:
tr = ((Increment)mutation).getTimeRange();
break;
case APPEND:
tr = ((Append)mutation).getTimeRange();
break;
default:
break;
}
List<Cell> currentValues = get(mutation, store, deltas, null, tr);
// Iterate the input columns and update existing values if they were found, otherwise
// add new column initialized to the delta amount
int currentValuesIndex = 0;
for (int i = 0; i < deltas.size(); i++) {
Cell delta = deltas.get(i);
Cell currentValue = null;
if (currentValuesIndex < currentValues.size() &&
CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)) {
currentValue = currentValues.get(currentValuesIndex);
if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) {
currentValuesIndex++;
}
}
// Switch on whether this is an increment or an append, building the new Cell to apply.
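// Worked example (illustrative values): an Increment of 1 against a current value of 41
// yields a new Cell holding Bytes.toBytes(42L); an Append of "bar" onto a current value of
// "foo" yields a new Cell holding "foobar" (current value first, then the delta).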
Cell newCell = null;
switch (op) {
case INCREMENT:
long deltaAmount = getLongValue(delta);
final long newValue = currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount;
newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) -> Bytes.toBytes(newValue));
break;
case APPEND:
newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) ->
ByteBuffer.wrap(new byte[delta.getValueLength() + oldCell.getValueLength()])
.put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength())
.put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength())
.array()
);
break;
default: throw new UnsupportedOperationException(op.toString());
}
if (this.maxCellSize > 0) {
int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell);
if (newCellSize > this.maxCellSize) {
String msg = "Cell with size " + newCellSize + " exceeds limit of " + this.maxCellSize
+ " bytes in region " + this;
LOG.debug(msg);
throw new DoNotRetryIOException(msg);
}
}
cellPairs.add(new Pair<>(currentValue, newCell));
// Add to results to get returned to the Client. If null, client does not want results.
if (results != null) {
results.add(newCell);
}
}
// Give coprocessors a chance to update the new cells before applying to WAL or memstore.
if (coprocessorHost != null) {
// Here the operation must be increment or append.
cellPairs = op == Operation.INCREMENT ?
coprocessorHost.postIncrementBeforeWAL(mutation, cellPairs) :
coprocessorHost.postAppendBeforeWAL(mutation, cellPairs);
}
return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList());
}
private static Cell reckonDelta(final Cell delta, final Cell currentCell,
final byte[] columnFamily, final long now,
Mutation mutation, Function<Cell, byte[]> supplier) throws IOException {
// Forward any tags found on the delta.
List<Tag> tags = TagUtil.carryForwardTags(delta);
tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
if (currentCell != null) {
tags = TagUtil.carryForwardTags(tags, currentCell);
byte[] newValue = supplier.apply(currentCell);
return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(mutation.getRow(), 0, mutation.getRow().length)
.setFamily(columnFamily, 0, columnFamily.length)
// copy the qualifier if the cell is located in shared memory.
.setQualifier(CellUtil.cloneQualifier(delta))
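// Bump the timestamp past the current cell so the new version always sorts as the latest,
// even if the server clock has not advanced since the current cell was written.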
.setTimestamp(Math.max(currentCell.getTimestamp() + 1, now))
.setType(KeyValue.Type.Put.getCode())
.setValue(newValue, 0, newValue.length)
.setTags(TagUtil.fromList(tags))
.build();
} else {
PrivateCellUtil.updateLatestStamp(delta, now);
return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags);
}
}
/**
* @return The long value encoded in the passed in Cell
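* <p>For example (illustrative values): a counter written with {@code Bytes.toBytes(42L)} is
* 8 bytes wide and parses cleanly, while a value written as a String such as
* {@code Bytes.toBytes("42")} is not 8 bytes and trips the DoNotRetryIOException below.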
*/
private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
int len = cell.getValueLength();
if (len != Bytes.SIZEOF_LONG) {
// throw DoNotRetryIOException instead of IllegalArgumentException
throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
}
return PrivateCellUtil.getValueAsLong(cell);
}
/**
* Do a specific Get on passed columnFamily and column qualifiers.
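* <p>For example (qualifier names illustrative), an Increment touching {@code cf:q1} and
* {@code cf:q2} results in a Get that fetches the current values of those two columns on the
* mutation's row, constrained by the Increment's TimeRange when one was supplied.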
* @param mutation Mutation we are doing this Get for.
* @param store Which column family on row (TODO: Go all Gets in one go)
* @param coordinates Cells from mutation used as coordinates applied to Get.
* @return Return list of Cells found.
*/
private List<Cell> get(Mutation mutation, HStore store, List<Cell> coordinates,
IsolationLevel isolation, TimeRange tr) throws IOException {
// Sort the cells so that they match the order that they appear in the Get results. Otherwise,
// we won't be able to find the existing values if the cells are not specified in order by the
// client since cells are in an array list.
// TODO: I don't get why we are sorting. St.Ack 20150107
sort(coordinates, store.getComparator());
Get get = new Get(mutation.getRow());
if (isolation != null) {
get.setIsolationLevel(isolation);
}
for (Cell cell: coordinates) {
get.addColumn(store.getColumnFamilyDescriptor().getName(), CellUtil.cloneQualifier(cell));
}
// Increments carry time range. If an Increment instance, put it on the Get.
if (tr != null) {
get.setTimeRange(tr.getMin(), tr.getMax());
}
return get(get, false);
}
/**
* @return Sorted list of cells using comparator
*/
private static List<Cell> sort(List<Cell> cells, final CellComparator comparator) {
cells.sort(comparator);
return cells;
}
//
// New HBASE-880 Helpers
//
void checkFamily(final byte [] family)
throws NoSuchColumnFamilyException {
if (!this.htableDescriptor.hasColumnFamily(family)) {
throw new NoSuchColumnFamilyException("Column family " +
Bytes.toString(family) + " does not exist in region " + this
+ " in table " + this.htableDescriptor);
}
}
public static final long FIXED_OVERHEAD = ClassSize.align(
ClassSize.OBJECT +
ClassSize.ARRAY +
55 * ClassSize.REFERENCE + 3 * Bytes.SIZEOF_INT +
(15 * Bytes.SIZEOF_LONG) +
3 * Bytes.SIZEOF_BOOLEAN);
// woefully out of date - currently missing:
// 1 x HashMap - coprocessorServiceHandlers
// 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
// checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
// writeRequestsCount
// 1 x HRegion$WriteState - writestate
// 1 x RegionCoprocessorHost - coprocessorHost
// 1 x RegionSplitPolicy - splitPolicy
// 1 x MetricsRegion - metricsRegion
// 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
ClassSize.OBJECT + // closeLock
(2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
(3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL,
// compactionsFailed
(2 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints
WriteState.HEAP_SIZE + // writestate
ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
(2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
+ 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes
+ 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
+ ClassSize.STORE_SERVICES // store services
+ StoreHotnessProtector.FIXED_SIZE
;
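// Maintenance note: when fields are added to or removed from HRegion, FIXED_OVERHEAD above
// must be re-derived. A hedged sketch of the usual sanity check (mirrors what a heap-size
// unit test does; not part of this class):
//   long estimated = ClassSize.estimateBase(HRegion.class, false);
//   assert estimated == HRegion.FIXED_OVERHEAD : "FIXED_OVERHEAD is stale";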
@Override
public long heapSize() {
// this does not take into account row locks, recent flushes, mvcc entries, and more
return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum();
}
/**
* Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
* be available for handling Region#execService(com.google.protobuf.RpcController,
* org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls.
*
* | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |