/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY;
import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.REGION_NAMES_KEY;
import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.ROW_LOCK_READ_LOCK_KEY;
import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;

import com.google.errorprone.annotations.RestrictedApi;
import edu.umd.cs.findbugs.annotations.Nullable;
import io.opentelemetry.api.trace.Span;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.Objects;
import java.util.Optional;
import java.util.RandomAccess;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.LongAdder;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.CompoundConfiguration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.ExtendedCellBuilderFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MetaCellComparator;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.TagUtil;
import org.apache.hadoop.hbase.client.Append;
import org.apache.hadoop.hbase.client.CheckAndMutate;
import org.apache.hadoop.hbase.client.CheckAndMutateResult;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.CompactionState;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.IsolationLevel;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.conf.ConfigurationManager;
import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
import org.apache.hadoop.hbase.exceptions.TimeoutIOException;
import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ByteArrayComparable;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.io.HFileLink;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.TimeRange;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
import org.apache.hadoop.hbase.ipc.RpcCall;
import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.mob.MobFileCache;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager;
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker;
import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector;
import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
import org.apache.hadoop.hbase.replication.ReplicationUtils;
import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.CoprocessorConfigurationUtil;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HashedBytes;
import org.apache.hadoop.hbase.util.NonceKey;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.TableDescriptorChecker;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALSplitUtil;
import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay;
import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
import org.apache.hbase.thirdparty.com.google.protobuf.Service;
import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;
import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;

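/*
 * Usage sketch (illustrative only): per the constructor javadocs further down in this file,
 * instances of HRegion are expected to be created through HRegion.createHRegion or
 * HRegion.openHRegion rather than the constructors themselves. The fragment below assumes a
 * prepared Configuration, RegionInfo, TableDescriptor, WAL and root Path (all names are
 * placeholders), and the exact factory overloads may vary across HBase versions:
 *
 *   HRegion region = HRegion.createHRegion(regionInfo, rootDir, conf, tableDescriptor, wal);
 *   try {
 *     region.put(new Put(row).addColumn(family, qualifier, Bytes.toBytes("v")));
 *     Result result = region.get(new Get(row));
 *   } finally {
 *     region.close(); // takes the region write lock, flushes and shuts down each HStore
 *   }
 */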
/**
 * Regions store data for a certain region of a table. It stores all columns for each row. A given
 * table consists of one or more Regions.
 * <p>
 * A Region is defined by its table and its key extent.
 * <p>
 * Locking at the Region level serves only one purpose: preventing the region from being closed
 * (and consequently split) while other operations are ongoing. Each row level operation obtains
 * both a row lock and a region read lock for the duration of the operation. While a scanner is
 * being constructed, getScanner holds a read lock. If the scanner is successfully constructed, it
 * holds a read lock until it is closed. A close takes out a write lock and consequently will block
 * for ongoing operations and will block new operations from starting while the close is in
 * progress.
 */
@SuppressWarnings("deprecation")
@InterfaceAudience.Private
public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
  private static final Logger LOG = LoggerFactory.getLogger(HRegion.class);

  public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
    "hbase.hregion.scan.loadColumnFamiliesOnDemand";

  public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize";
  public static final int DEFAULT_MAX_CELL_SIZE = 10485760;

  public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE =
    "hbase.regionserver.minibatch.size";
  public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000;

  public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync";
  public static final boolean DEFAULT_WAL_HSYNC = false;

  /** Parameter name for compaction after bulkload */
  public static final String COMPACTION_AFTER_BULKLOAD_ENABLE =
    "hbase.compaction.after.bulkload.enable";

  /** Config for allow split when file count greater than the configured blocking file count */
  public static final String SPLIT_IGNORE_BLOCKING_ENABLED_KEY =
    "hbase.hregion.split.ignore.blocking.enabled";

  public static final String REGION_STORAGE_POLICY_KEY = "hbase.hregion.block.storage.policy";
  public static final String DEFAULT_REGION_STORAGE_POLICY = "NONE";

  /**
   * This is for using HRegion as a local storage, where we may put the recovered edits in a
   * special place. Once this is set, we will only replay the recovered edits under this directory
   * and ignore the original replay directory configs.
   */
  public static final String SPECIAL_RECOVERED_EDITS_DIR =
    "hbase.hregion.special.recovered.edits.dir";

  /**
   * Whether to use {@link MetaCellComparator} even if we are not meta region. Used when creating
   * master local region.
   */
  public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator";
  public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false;

  final AtomicBoolean closed = new AtomicBoolean(false);

  /*
   * Closing can take some time; use the closing flag if there is stuff we don't want to do while
   * in closing state; e.g. like offer this region up to the master as a region to close if the
   * carrying regionserver is overloaded. Once set, it is never cleared.
   */
  final AtomicBoolean closing = new AtomicBoolean(false);

  /**
   * The max sequence id of flushed data on this region. There is no edit in memory that is less
   * than this sequence id.
   */
  private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;

  /**
   * Record the sequence id of last flush operation. Can be in advance of {@link #maxFlushedSeqId}
   * when flushing a single column family. In this case, {@link #maxFlushedSeqId} will be older
   * than the oldest edit in memory.
   */
  private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;

  /**
   * The sequence id of the last replayed open region event from the primary region.
This is used to * skip entries before this due to the possibility of replay edits coming out of order from * replication. */ protected volatile long lastReplayedOpenRegionSeqId = -1L; protected volatile long lastReplayedCompactionSeqId = -1L; ////////////////////////////////////////////////////////////////////////////// // Members ////////////////////////////////////////////////////////////////////////////// // map from a locked row to the context for that lock including: // - CountDownLatch for threads waiting on that row // - the thread that owns the lock (allow reentrancy) // - reference count of (reentrant) locks held by the thread // - the row itself private final ConcurrentHashMap lockedRows = new ConcurrentHashMap<>(); protected final Map stores = new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR); // TODO: account for each registered handler in HeapSize computation private Map coprocessorServiceHandlers = Maps.newHashMap(); // Track data size in all memstores private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing(); RegionServicesForStores regionServicesForStores; // Debug possible data loss due to WAL off final LongAdder numMutationsWithoutWAL = new LongAdder(); final LongAdder dataInMemoryWithoutWAL = new LongAdder(); // Debug why CAS operations are taking a while. final LongAdder checkAndMutateChecksPassed = new LongAdder(); final LongAdder checkAndMutateChecksFailed = new LongAdder(); // Number of requests // Count rows for scan final LongAdder readRequestsCount = new LongAdder(); final LongAdder filteredReadRequestsCount = new LongAdder(); // Count rows for multi row mutations final LongAdder writeRequestsCount = new LongAdder(); // Number of requests blocked by memstore size. private final LongAdder blockedRequestsCount = new LongAdder(); // Compaction LongAdders final LongAdder compactionsFinished = new LongAdder(); final LongAdder compactionsFailed = new LongAdder(); final LongAdder compactionNumFilesCompacted = new LongAdder(); final LongAdder compactionNumBytesCompacted = new LongAdder(); final LongAdder compactionsQueued = new LongAdder(); final LongAdder flushesQueued = new LongAdder(); private BlockCache blockCache; private MobFileCache mobFileCache; private final WAL wal; private final HRegionFileSystem fs; protected final Configuration conf; private final Configuration baseConf; private final int rowLockWaitDuration; static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; private Path regionWalDir; private FileSystem walFS; // set to true if the region is restored from snapshot private boolean isRestoredRegion = false; public void setRestoredRegion(boolean restoredRegion) { isRestoredRegion = restoredRegion; } // The internal wait duration to acquire a lock before read/update // from the region. It is not per row. The purpose of this wait time // is to avoid waiting a long time while the region is busy, so that // we can release the IPC handler soon enough to improve the // availability of the region server. It can be adjusted by // tuning configuration "hbase.busy.wait.duration". final long busyWaitDuration; static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT; // If updating multiple rows in one call, wait longer, // i.e. waiting for busyWaitDuration * # of rows. However, // we can limit the max multiplier. final int maxBusyWaitMultiplier; // Max busy wait duration. There is no point to wait longer than the RPC // purge timeout, when a RPC call will be terminated by the RPC engine. 
final long maxBusyWaitDuration; // Max cell size. If nonzero, the maximum allowed size for any given cell // in bytes final long maxCellSize; // Number of mutations for minibatch processing. private final int miniBatchSize; // negative number indicates infinite timeout static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L; final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool(); final ConcurrentHashMap scannerReadPoints; final ReadPointCalculationLock smallestReadPointCalcLock; /** * The sequence ID that was enLongAddered when this region was opened. */ private long openSeqNum = HConstants.NO_SEQNUM; /** * The default setting for whether to enable on-demand CF loading for scan requests to this * region. Requests can override it. */ private boolean isLoadingCfsOnDemandDefault = false; private final AtomicInteger majorInProgress = new AtomicInteger(0); private final AtomicInteger minorInProgress = new AtomicInteger(0); // // Context: During replay we want to ensure that we do not lose any data. So, we // have to be conservative in how we replay wals. For each store, we calculate // the maxSeqId up to which the store was flushed. And, skip the edits which // are equal to or lower than maxSeqId for each store. // The following map is populated when opening the region Map maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR); /** Saved state from replaying prepare flush cache */ private PrepareFlushResult prepareFlushResult = null; private volatile ConfigurationManager configurationManager; // Used for testing. private volatile Long timeoutForWriteLock = null; private final CellComparator cellComparator; /** * @return The smallest mvcc readPoint across all the scanners in this region. Writes older than * this readPoint, are included in every read operation. */ public long getSmallestReadPoint() { // We need to ensure that while we are calculating the smallestReadPoint // no new RegionScanners can grab a readPoint that we are unaware of. smallestReadPointCalcLock.lock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); try { long minimumReadPoint = mvcc.getReadPoint(); for (Long readPoint : this.scannerReadPoints.values()) { minimumReadPoint = Math.min(minimumReadPoint, readPoint); } return minimumReadPoint; } finally { smallestReadPointCalcLock.unlock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); } } /* * Data structure of write state flags used coordinating flushes, compactions and closes. */ static class WriteState { // Set while a memstore flush is happening. volatile boolean flushing = false; // Set when a flush has been requested. volatile boolean flushRequested = false; // Number of compactions running. AtomicInteger compacting = new AtomicInteger(0); // Gets set in close. If set, cannot compact or flush again. volatile boolean writesEnabled = true; // Set if region is read-only volatile boolean readOnly = false; // whether the reads are enabled. This is different than readOnly, because readOnly is // static in the lifetime of the region, while readsEnabled is dynamic volatile boolean readsEnabled = true; /** * Set flags that make this region read-only. 
* @param onOff flip value for region r/o setting */ synchronized void setReadOnly(final boolean onOff) { this.writesEnabled = !onOff; this.readOnly = onOff; } boolean isReadOnly() { return this.readOnly; } boolean isFlushRequested() { return this.flushRequested; } void setReadsEnabled(boolean readsEnabled) { this.readsEnabled = readsEnabled; } static final long HEAP_SIZE = ClassSize.align(ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN); } /** * Objects from this class are created when flushing to describe all the different states that * that method ends up in. The Result enum describes those states. The sequence id should only be * specified if the flush was successful, and the failure message should only be specified if it * didn't flush. */ public static class FlushResultImpl implements FlushResult { final Result result; final String failureReason; final long flushSequenceId; final boolean wroteFlushWalMarker; /** * Convenience constructor to use when the flush is successful, the failure message is set to * null. * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED. * @param flushSequenceId Generated sequence id that comes right after the edits in the * memstores. */ FlushResultImpl(Result result, long flushSequenceId) { this(result, flushSequenceId, null, false); assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result.FLUSHED_COMPACTION_NEEDED; } /** * Convenience constructor to use when we cannot flush. * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH. * @param failureReason Reason why we couldn't flush. */ FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) { this(result, -1, failureReason, wroteFlushMarker); assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH; } /** * Constructor with all the parameters. * @param result Any of the Result. * @param flushSequenceId Generated sequence id if the memstores were flushed else -1. * @param failureReason Reason why we couldn't flush, or null. */ FlushResultImpl(Result result, long flushSequenceId, String failureReason, boolean wroteFlushMarker) { this.result = result; this.flushSequenceId = flushSequenceId; this.failureReason = failureReason; this.wroteFlushWalMarker = wroteFlushMarker; } /** * Convenience method, the equivalent of checking if result is FLUSHED_NO_COMPACTION_NEEDED or * FLUSHED_NO_COMPACTION_NEEDED. * @return true if the memstores were flushed, else false. */ @Override public boolean isFlushSucceeded() { return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result.FLUSHED_COMPACTION_NEEDED; } /** * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED. * @return True if the flush requested a compaction, else false (doesn't even mean it flushed). 
*/ @Override public boolean isCompactionNeeded() { return result == Result.FLUSHED_COMPACTION_NEEDED; } @Override public String toString() { return new StringBuilder().append("flush result:").append(result).append(", ") .append("failureReason:").append(failureReason).append(",").append("flush seq id") .append(flushSequenceId).toString(); } @Override public Result getResult() { return result; } } /** A result object from prepare flush cache stage */ static class PrepareFlushResult { final FlushResultImpl result; // indicating a failure result from prepare final TreeMap storeFlushCtxs; final TreeMap> committedFiles; final TreeMap storeFlushableSize; final long startTime; final long flushOpSeqId; final long flushedSeqId; final MemStoreSizing totalFlushableSize; /** Constructs an early exit case */ PrepareFlushResult(FlushResultImpl result, long flushSeqId) { this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD); } /** Constructs a successful prepare flush result */ PrepareFlushResult(TreeMap storeFlushCtxs, TreeMap> committedFiles, TreeMap storeFlushableSize, long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushSeqId, flushedSeqId, totalFlushableSize); } private PrepareFlushResult(FlushResultImpl result, TreeMap storeFlushCtxs, TreeMap> committedFiles, TreeMap storeFlushableSize, long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { this.result = result; this.storeFlushCtxs = storeFlushCtxs; this.committedFiles = committedFiles; this.storeFlushableSize = storeFlushableSize; this.startTime = startTime; this.flushOpSeqId = flushSeqId; this.flushedSeqId = flushedSeqId; this.totalFlushableSize = totalFlushableSize; } public FlushResult getResult() { return this.result; } } /** * A class that tracks exceptions that have been observed in one batch. Not thread safe. */ static class ObservedExceptionsInBatch { private boolean wrongRegion = false; private boolean failedSanityCheck = false; private boolean wrongFamily = false; /** Returns If a {@link WrongRegionException} has been observed. */ boolean hasSeenWrongRegion() { return wrongRegion; } /** * Records that a {@link WrongRegionException} has been observed. */ void sawWrongRegion() { wrongRegion = true; } /** Returns If a {@link FailedSanityCheckException} has been observed. */ boolean hasSeenFailedSanityCheck() { return failedSanityCheck; } /** * Records that a {@link FailedSanityCheckException} has been observed. */ void sawFailedSanityCheck() { failedSanityCheck = true; } /** Returns If a {@link NoSuchColumnFamilyException} has been observed. */ boolean hasSeenNoSuchFamily() { return wrongFamily; } /** * Records that a {@link NoSuchColumnFamilyException} has been observed. */ void sawNoSuchFamily() { wrongFamily = true; } } final WriteState writestate = new WriteState(); long memstoreFlushSize; final long timestampSlop; final long rowProcessorTimeout; // Last flush time for each Store. Useful when we are flushing for each column private final ConcurrentMap lastStoreFlushTimeMap = new ConcurrentHashMap<>(); protected RegionServerServices rsServices; private RegionServerAccounting rsAccounting; private long flushCheckInterval; // flushPerChanges is to prevent too many changes in memstore private long flushPerChanges; private long blockingMemStoreSize; // Used to guard closes final ReentrantReadWriteLock lock; // Used to track interruptible holders of the region lock. 
Currently that is only RPC handler // threads. Boolean value in map determines if lock holder can be interrupted, normally true, // but may be false when thread is transiting a critical section. final ConcurrentHashMap regionLockHolders; // Stop updates lock private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock(); private final MultiVersionConcurrencyControl mvcc; // Coprocessor host private volatile RegionCoprocessorHost coprocessorHost; private TableDescriptor htableDescriptor = null; private RegionSplitPolicy splitPolicy; private RegionSplitRestriction splitRestriction; private FlushPolicy flushPolicy; private final MetricsRegion metricsRegion; private final MetricsRegionWrapperImpl metricsRegionWrapper; private final Durability regionDurability; private final boolean regionStatsEnabled; // Stores the replication scope of the various column families of the table // that has non-default scope private final NavigableMap replicationScope = new TreeMap<>(Bytes.BYTES_COMPARATOR); private final StoreHotnessProtector storeHotnessProtector; /** * HRegion constructor. This constructor should only be used for testing and extensions. Instances * of HRegion should be instantiated with the {@link HRegion#createHRegion} or * {@link HRegion#openHRegion} method. * @param tableDir qualified path of directory where region should be located, usually the table * directory. * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a * logfile from the previous execution that's custom-computed for this HRegion. * The HRegionServer computes and sorts the appropriate wal info for this * HRegion. If there is a previous wal file (implying that the HRegion has been * written-to before), then read it from the supplied path. * @param fs is the filesystem. * @param confParam is global configuration settings. * @param regionInfo - RegionInfo that describes the region is new), then read them from the * supplied path. * @param htd the table descriptor * @param rsServices reference to {@link RegionServerServices} or null * @deprecated Use other constructors. */ @Deprecated public HRegion(final Path tableDir, final WAL wal, final FileSystem fs, final Configuration confParam, final RegionInfo regionInfo, final TableDescriptor htd, final RegionServerServices rsServices) { this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo), wal, confParam, htd, rsServices); } /** * HRegion constructor. This constructor should only be used for testing and extensions. Instances * of HRegion should be instantiated with the {@link HRegion#createHRegion} or * {@link HRegion#openHRegion} method. * @param fs is the filesystem. * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a * logfile from the previous execution that's custom-computed for this HRegion. * The HRegionServer computes and sorts the appropriate wal info for this * HRegion. If there is a previous wal file (implying that the HRegion has been * written-to before), then read it from the supplied path. * @param confParam is global configuration settings. 
* @param htd the table descriptor * @param rsServices reference to {@link RegionServerServices} or null */ public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam, final TableDescriptor htd, final RegionServerServices rsServices) { if (htd == null) { throw new IllegalArgumentException("Need table descriptor"); } if (confParam instanceof CompoundConfiguration) { throw new IllegalArgumentException("Need original base configuration"); } this.wal = wal; this.fs = fs; this.mvcc = new MultiVersionConcurrencyControl(getRegionInfo().getShortNameToLog()); // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor this.baseConf = confParam; this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues()); this.cellComparator = htd.isMetaTable() || conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR) ? MetaCellComparator.META_COMPARATOR : CellComparatorImpl.COMPARATOR; this.lock = new ReentrantReadWriteLock( conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK, DEFAULT_FAIR_REENTRANT_CLOSE_LOCK)); this.regionLockHolders = new ConcurrentHashMap<>(); this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL); this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES); if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) { throw new IllegalArgumentException( MEMSTORE_FLUSH_PER_CHANGES + " can not exceed " + MAX_FLUSH_PER_CHANGES); } int tmpRowLockDuration = conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); if (tmpRowLockDuration <= 0) { LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row " + "locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration); tmpRowLockDuration = 1; } this.rowLockWaitDuration = tmpRowLockDuration; this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true); this.htableDescriptor = htd; Set families = this.htableDescriptor.getColumnFamilyNames(); for (byte[] family : families) { if (!replicationScope.containsKey(family)) { int scope = htd.getColumnFamily(family).getScope(); // Only store those families that has NON-DEFAULT scope if (scope != REPLICATION_SCOPE_LOCAL) { // Do a copy before storing it here. replicationScope.put(Bytes.copy(family), scope); } } } this.rsServices = rsServices; if (rsServices != null) { this.blockCache = rsServices.getBlockCache().orElse(null); this.mobFileCache = rsServices.getMobFileCache().orElse(null); } this.regionServicesForStores = new RegionServicesForStores(this, rsServices); setHTableSpecificConf(); this.scannerReadPoints = new ConcurrentHashMap<>(); this.smallestReadPointCalcLock = new ReadPointCalculationLock(conf); this.busyWaitDuration = conf.getLong("hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION); this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2); if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) { throw new IllegalArgumentException("Invalid hbase.busy.wait.duration (" + busyWaitDuration + ") or hbase.busy.wait.multiplier.max (" + maxBusyWaitMultiplier + "). Their product should be positive"); } this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT); /* * timestamp.slop provides a server-side constraint on the timestamp. This assumes that you base * your TS around EnvironmentEdgeManager.currentTime(). 
In this case, throw an error to the user * if the user-specified TS is newer than now + slop. LATEST_TIMESTAMP == don't use this * functionality */ this.timestampSlop = conf.getLong("hbase.hregion.keyvalue.timestamp.slop.millisecs", HConstants.LATEST_TIMESTAMP); /** * Timeout for the process time in processRowsWithLocks(). Use -1 to switch off time bound. */ this.rowProcessorTimeout = conf.getLong("hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT); this.storeHotnessProtector = new StoreHotnessProtector(this, conf); boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC); /** * This is the global default value for durability. All tables/mutations not defining a * durability or using USE_DEFAULT will default to this value. */ Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL; this.regionDurability = this.htableDescriptor.getDurability() == Durability.USE_DEFAULT ? defaultDurability : this.htableDescriptor.getDurability(); decorateRegionConfiguration(conf); if (rsServices != null) { this.rsAccounting = this.rsServices.getRegionServerAccounting(); // don't initialize coprocessors if not running within a regionserver // TODO: revisit if coprocessors should load in other cases this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this); this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf); } else { this.metricsRegionWrapper = null; this.metricsRegion = null; } if (LOG.isDebugEnabled()) { // Write out region name, its encoded name and storeHotnessProtector as string. LOG.debug("Instantiated " + this + "; " + storeHotnessProtector.toString()); } configurationManager = null; // disable stats tracking system tables, but check the config for everything else this.regionStatsEnabled = htd.getTableName().getNamespaceAsString() .equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ? false : conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE, HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE); this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE); this.miniBatchSize = conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE); // recover the metrics of read and write requests count if they were retained if (rsServices != null && rsServices.getRegionServerAccounting() != null) { Pair retainedRWRequestsCnt = rsServices.getRegionServerAccounting() .getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName()); if (retainedRWRequestsCnt != null) { this.addReadRequestsCount(retainedRWRequestsCnt.getFirst()); this.addWriteRequestsCount(retainedRWRequestsCnt.getSecond()); // remove them since won't use again rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt() .remove(getRegionInfo().getEncodedName()); } } } private void setHTableSpecificConf() { if (this.htableDescriptor == null) { return; } long flushSize = this.htableDescriptor.getMemStoreFlushSize(); if (flushSize <= 0) { flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE); } this.memstoreFlushSize = flushSize; long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER); this.blockingMemStoreSize = this.memstoreFlushSize * mult; } /** * Initialize this region. Used only by tests and SplitTransaction to reopen the region. 
You * should use createHRegion() or openHRegion() * @return What the next sequence (edit) id should be. * @throws IOException e * @deprecated use HRegion.createHRegion() or HRegion.openHRegion() */ @Deprecated public long initialize() throws IOException { return initialize(null); } /** * Initialize this region. * @param reporter Tickle every so often if initialize is taking a while. * @return What the next sequence (edit) id should be. */ long initialize(final CancelableProgressable reporter) throws IOException { // Refuse to open the region if there is no column family in the table if (htableDescriptor.getColumnFamilyCount() == 0) { throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString() + " should have at least one column family."); } MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this, false, true); long nextSeqId = -1; try { nextSeqId = initializeRegionInternals(reporter, status); return nextSeqId; } catch (IOException e) { LOG.warn("Failed initialize of region= {}, starting to roll back memstore", getRegionInfo().getRegionNameAsString(), e); // global memstore size will be decreased when dropping memstore try { // drop the memory used by memstore if open region fails dropMemStoreContents(); } catch (IOException ioE) { if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) { LOG.warn( "Failed drop memstore of region= {}, " + "some chunks may not released forever since MSLAB is enabled", getRegionInfo().getRegionNameAsString()); } } throw e; } finally { // nextSeqid will be -1 if the initialization fails. // At least it will be 0 otherwise. if (nextSeqId == -1) { status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() + " initialization."); } if (LOG.isDebugEnabled()) { LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); } status.cleanup(); } } private long initializeRegionInternals(final CancelableProgressable reporter, final MonitoredTask status) throws IOException { if (coprocessorHost != null) { status.setStatus("Running coprocessor pre-open hook"); coprocessorHost.preOpen(); } String policyName = this.conf.get(REGION_STORAGE_POLICY_KEY, DEFAULT_REGION_STORAGE_POLICY); this.fs.setStoragePolicy(policyName.trim()); // Write HRI to a file in case we need to recover hbase:meta // Only the primary replica should write .regioninfo if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { status.setStatus("Writing region info on filesystem"); fs.checkRegionInfoOnFilesystem(); } // Initialize all the HStores status.setStatus("Initializing all the Stores"); long maxSeqId = initializeStores(reporter, status); this.mvcc.advanceTo(maxSeqId); if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) { Collection stores = this.stores.values(); try { // update the stores that we are replaying stores.forEach(HStore::startReplayingFromWAL); // Recover any edits if available. maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status)); // Recover any hfiles if available maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores)); // Make sure mvcc is up to max. 
this.mvcc.advanceTo(maxSeqId); } finally { // update the stores that we are done replaying stores.forEach(HStore::stopReplayingFromWAL); } } this.lastReplayedOpenRegionSeqId = maxSeqId; this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this)); this.writestate.flushRequested = false; this.writestate.compacting.set(0); if (this.writestate.writesEnabled) { // Remove temporary data left over from old regions status.setStatus("Cleaning up temporary data from old regions"); fs.cleanupTempDir(); } // Initialize split policy this.splitPolicy = RegionSplitPolicy.create(this, conf); // Initialize split restriction splitRestriction = RegionSplitRestriction.create(getTableDescriptor(), conf); // Initialize flush policy this.flushPolicy = FlushPolicyFactory.create(this, conf); long lastFlushTime = EnvironmentEdgeManager.currentTime(); for (HStore store : stores.values()) { this.lastStoreFlushTimeMap.put(store, lastFlushTime); } // Use maximum of log sequenceid or that which was found in stores // (particularly if no recovered edits, seqid will be -1). long nextSeqId = maxSeqId + 1; if (!isRestoredRegion) { // always get openSeqNum from the default replica, even if we are secondary replicas long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf, RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem, this::getWalFileSystem); nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1; // The openSeqNum will always be increase even for read only region, as we rely on it to // determine whether a region has been successfully reopened, so here we always need to update // the max sequence id file. if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName()); WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), nextSeqId - 1); // This means we have replayed all the recovered edits and also written out the max sequence // id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617 // for more details. Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), getRegionInfo().getEncodedName()); FileSystem walFs = getWalFileSystem(); if (walFs.exists(wrongRegionWALDir)) { if (!walFs.delete(wrongRegionWALDir, true)) { LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir); } } } } LOG.info("Opened {}; next sequenceid={}; {}, {}", this.getRegionInfo().getShortNameToLog(), nextSeqId, this.splitPolicy, this.flushPolicy); // A region can be reopened if failed a split; reset flags this.closing.set(false); this.closed.set(false); if (coprocessorHost != null) { status.setStatus("Running coprocessor post-open hooks"); coprocessorHost.postOpen(); } status.markComplete("Region opened successfully"); return nextSeqId; } /** * Open all Stores. * @return Highest sequenceId found out in a Store. */ private long initializeStores(CancelableProgressable reporter, MonitoredTask status) throws IOException { return initializeStores(reporter, status, false); } private long initializeStores(CancelableProgressable reporter, MonitoredTask status, boolean warmup) throws IOException { // Load in all the HStores. long maxSeqId = -1; // initialized to -1 so that we pick up MemstoreTS from column families long maxMemstoreTS = -1; if (htableDescriptor.getColumnFamilyCount() != 0) { // initialize the thread pool for opening stores in parallel. 
ThreadPoolExecutor storeOpenerThreadPool = getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog()); CompletionService completionService = new ExecutorCompletionService<>(storeOpenerThreadPool); // initialize each store in parallel for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) { status.setStatus("Instantiating store for column family " + family); completionService.submit(new Callable() { @Override public HStore call() throws IOException { return instantiateHStore(family, warmup); } }); } boolean allStoresOpened = false; boolean hasSloppyStores = false; try { for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) { Future future = completionService.take(); HStore store = future.get(); this.stores.put(store.getColumnFamilyDescriptor().getName(), store); if (store.isSloppyMemStore()) { hasSloppyStores = true; } long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L); maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()), storeMaxSequenceId); if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) { maxSeqId = storeMaxSequenceId; } long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L); if (maxStoreMemstoreTS > maxMemstoreTS) { maxMemstoreTS = maxStoreMemstoreTS; } } allStoresOpened = true; if (hasSloppyStores) { htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor) .setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName()).build(); LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this); } } catch (InterruptedException e) { throw throwOnInterrupt(e); } catch (ExecutionException e) { throw new IOException(e.getCause()); } finally { storeOpenerThreadPool.shutdownNow(); if (!allStoresOpened) { // something went wrong, close all opened stores LOG.error("Could not initialize all stores for the region=" + this); for (HStore store : this.stores.values()) { try { store.close(); } catch (IOException e) { LOG.warn("close store {} failed in region {}", store.toString(), this, e); } } } } } return Math.max(maxSeqId, maxMemstoreTS + 1); } private void initializeWarmup(final CancelableProgressable reporter) throws IOException { MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this); // Initialize all the HStores status.setStatus("Warmup all stores of " + this.getRegionInfo().getRegionNameAsString()); try { initializeStores(reporter, status, true); } finally { status.markComplete("Warmed up " + this.getRegionInfo().getRegionNameAsString()); } } /** Returns Map of StoreFiles by column family */ private NavigableMap> getStoreFiles() { NavigableMap> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); for (HStore store : stores.values()) { Collection storeFiles = store.getStorefiles(); if (storeFiles == null) { continue; } List storeFileNames = new ArrayList<>(); for (HStoreFile storeFile : storeFiles) { storeFileNames.add(storeFile.getPath()); } allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames); } return allStoreFiles; } protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException { Map> storeFiles = getStoreFiles(); RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId, getRegionServerServices().getServerName(), storeFiles); WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc, mvcc); } private void writeRegionCloseMarker(WAL wal) throws IOException { 
Map> storeFiles = getStoreFiles(); RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor( RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(), getRegionServerServices().getServerName(), storeFiles); WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc, mvcc); // Store SeqId in WAL FileSystem when a region closes // checking region folder exists is due to many tests which delete the table folder while a // table is still online if (getWalFileSystem().exists(getWALRegionDir())) { WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), mvcc.getReadPoint()); } } /** Returns True if this region has references. */ public boolean hasReferences() { return stores.values().stream().anyMatch(HStore::hasReferences); } public void blockUpdates() { this.updatesLock.writeLock().lock(); } public void unblockUpdates() { this.updatesLock.writeLock().unlock(); } public HDFSBlocksDistribution getHDFSBlocksDistribution() { HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); stores.values().stream().filter(s -> s.getStorefiles() != null) .flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution) .forEachOrdered(hdfsBlocksDistribution::add); return hdfsBlocksDistribution; } /** * This is a helper function to compute HDFS block distribution on demand * @param conf configuration * @param tableDescriptor TableDescriptor of the table * @param regionInfo encoded name of the region * @return The HDFS blocks distribution for the given region. */ public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException { Path tablePath = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName()); return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath); } /** * This is a helper function to compute HDFS block distribution on demand * @param conf configuration * @param tableDescriptor TableDescriptor of the table * @param regionInfo encoded name of the region * @param tablePath the table directory * @return The HDFS blocks distribution for the given region. 
*/ public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException { HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); FileSystem fs = tablePath.getFileSystem(conf); HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo); for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) { List locatedFileStatusList = HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true); if (locatedFileStatusList == null) { continue; } for (LocatedFileStatus status : locatedFileStatusList) { Path p = status.getPath(); if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) { // Only construct StoreFileInfo object if its not a hfile, save obj // creation StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status); hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs)); } else if (StoreFileInfo.isHFile(p)) { // If its a HFile, then lets just add to the block distribution // lets not create more objects here, not even another HDFSBlocksDistribution FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations()); } else { throw new IOException("path=" + p + " doesn't look like a valid StoreFile"); } } } return hdfsBlocksDistribution; } /** * Increase the size of mem store in this region and the size of global mem store */ private void incMemStoreSize(MemStoreSize mss) { incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), mss.getCellsCount()); } void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, int cellsCountDelta) { if (this.rsAccounting != null) { rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); } long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta, cellsCountDelta); checkNegativeMemStoreDataSize(dataSize, dataSizeDelta); } void decrMemStoreSize(MemStoreSize mss) { decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), mss.getCellsCount()); } private void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, int cellsCountDelta) { if (this.rsAccounting != null) { rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); } long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta, cellsCountDelta); checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta); } private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) { // This is extremely bad if we make memStoreSizing negative. Log as much info on the offending // caller as possible. (memStoreSizing might be a negative value already -- freeing memory) if (memStoreDataSize < 0) { LOG.error("Asked to modify this region's (" + this.toString() + ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing=" + (memStoreDataSize - delta) + ", delta=" + delta, new Exception()); } } @Override public RegionInfo getRegionInfo() { return this.fs.getRegionInfo(); } /** Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null. 
*/ RegionServerServices getRegionServerServices() { return this.rsServices; } @Override public long getReadRequestsCount() { return readRequestsCount.sum(); } @Override public long getFilteredReadRequestsCount() { return filteredReadRequestsCount.sum(); } @Override public long getWriteRequestsCount() { return writeRequestsCount.sum(); } @Override public long getMemStoreDataSize() { return memStoreSizing.getDataSize(); } @Override public long getMemStoreHeapSize() { return memStoreSizing.getHeapSize(); } @Override public long getMemStoreOffHeapSize() { return memStoreSizing.getOffHeapSize(); } /** Returns store services for this region, to access services required by store level needs */ public RegionServicesForStores getRegionServicesForStores() { return regionServicesForStores; } @Override public long getNumMutationsWithoutWAL() { return numMutationsWithoutWAL.sum(); } @Override public long getDataInMemoryWithoutWAL() { return dataInMemoryWithoutWAL.sum(); } @Override public long getBlockedRequestsCount() { return blockedRequestsCount.sum(); } @Override public long getCheckAndMutateChecksPassed() { return checkAndMutateChecksPassed.sum(); } @Override public long getCheckAndMutateChecksFailed() { return checkAndMutateChecksFailed.sum(); } // TODO Needs to check whether we should expose our metrics system to CPs. If CPs themselves doing // the op and bypassing the core, this might be needed? Should be stop supporting the bypass // feature? public MetricsRegion getMetrics() { return metricsRegion; } @Override public boolean isClosed() { return this.closed.get(); } @Override public boolean isClosing() { return this.closing.get(); } @Override public boolean isReadOnly() { return this.writestate.isReadOnly(); } @Override public boolean isAvailable() { return !isClosed() && !isClosing(); } @Override public boolean isSplittable() { return splitPolicy.canSplit(); } @Override public boolean isMergeable() { if (!isAvailable()) { LOG.debug("Region " + this + " is not mergeable because it is closing or closed"); return false; } if (hasReferences()) { LOG.debug("Region " + this + " is not mergeable because it has references"); return false; } return true; } public boolean areWritesEnabled() { synchronized (this.writestate) { return this.writestate.writesEnabled; } } public MultiVersionConcurrencyControl getMVCC() { return mvcc; } @Override public long getMaxFlushedSeqId() { return maxFlushedSeqId; } /** Returns readpoint considering given IsolationLevel. Pass {@code null} for default */ public long getReadPoint(IsolationLevel isolationLevel) { if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) { // This scan can read even uncommitted transactions return Long.MAX_VALUE; } return mvcc.getReadPoint(); } public boolean isLoadingCfsOnDemandDefault() { return this.isLoadingCfsOnDemandDefault; } /** * Close down this HRegion. Flush the cache, shut down each HStore, don't service any more calls. *

* This method could take some time to execute, so don't call it from a time-sensitive thread. * @return Vector of all the storage files that the HRegion's component HStores make use of. It's * a list of all StoreFile objects. Returns empty vector if already closed and null if * judged that it should not close. * @throws IOException e * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was * not properly persisted. The region is put in closing mode, and * the caller MUST abort after this. */ public Map> close() throws IOException { return close(false); } private final Object closeLock = new Object(); /** Conf key for fair locking policy */ public static final String FAIR_REENTRANT_CLOSE_LOCK = "hbase.regionserver.fair.region.close.lock"; public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true; /** Conf key for the periodic flush interval */ public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL = "hbase.regionserver.optionalcacheflushinterval"; /** Default interval for the memstore flush */ public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000; /** Default interval for System tables memstore flush */ public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes /** Conf key to force a flush if there are already enough changes for one region in memstore */ public static final String MEMSTORE_FLUSH_PER_CHANGES = "hbase.regionserver.flush.per.changes"; public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 millions /** * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region */ public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G public static final String CLOSE_WAIT_ABORT = "hbase.regionserver.close.wait.abort"; public static final boolean DEFAULT_CLOSE_WAIT_ABORT = false; public static final String CLOSE_WAIT_TIME = "hbase.regionserver.close.wait.time.ms"; public static final long DEFAULT_CLOSE_WAIT_TIME = 60000; // 1 minute public static final String CLOSE_WAIT_INTERVAL = "hbase.regionserver.close.wait.interval.ms"; public static final long DEFAULT_CLOSE_WAIT_INTERVAL = 10000; // 10 seconds /** * Close down this HRegion. Flush the cache unless abort parameter is true, Shut down each HStore, * don't service any more calls. This method could take some time to execute, so don't call it * from a time-sensitive thread. * @param abort true if server is aborting (only during testing) * @return Vector of all the storage files that the HRegion's component HStores make use of. It's * a list of StoreFile objects. Can be null if we are not to close at this time or we are * already closed. * @throws IOException e * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was * not properly persisted. The region is put in closing mode, and * the caller MUST abort after this. */ public Map> close(boolean abort) throws IOException { // Only allow one thread to close at a time. Serialize them so dual // threads attempting to close will run up against each other. MonitoredTask status = TaskMonitor.get().createStatus( "Closing region " + this.getRegionInfo().getEncodedName() + (abort ? 
" due to abort" : ""), false, true); status.setStatus("Waiting for close lock"); try { synchronized (closeLock) { return doClose(abort, status); } } finally { if (LOG.isDebugEnabled()) { LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); } status.cleanup(); } } /** * Exposed for some very specific unit tests. */ public void setClosing(boolean closing) { this.closing.set(closing); } /** * The {@link HRegion#doClose} will block forever if someone tries proving the dead lock via the * unit test. Instead of blocking, the {@link HRegion#doClose} will throw exception if you set the * timeout. * @param timeoutForWriteLock the second time to wait for the write lock in * {@link HRegion#doClose} */ public void setTimeoutForWriteLock(long timeoutForWriteLock) { assert timeoutForWriteLock >= 0; this.timeoutForWriteLock = timeoutForWriteLock; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK_EXCEPTION_PATH", justification = "I think FindBugs is confused") private Map> doClose(boolean abort, MonitoredTask status) throws IOException { if (isClosed()) { LOG.warn("Region " + this + " already closed"); return null; } if (coprocessorHost != null) { status.setStatus("Running coprocessor pre-close hooks"); this.coprocessorHost.preClose(abort); } status.setStatus("Disabling compacts and flushes for region"); boolean canFlush = true; synchronized (writestate) { // Disable compacting and flushing by background threads for this // region. canFlush = !writestate.readOnly; writestate.writesEnabled = false; LOG.debug("Closing {}, disabling compactions & flushes", this.getRegionInfo().getEncodedName()); waitForFlushesAndCompactions(); } // If we were not just flushing, is it worth doing a preflush...one // that will clear out of the bulk of the memstore before we put up // the close flag? if (!abort && worthPreFlushing() && canFlush) { status.setStatus("Pre-flushing region before close"); LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName()); try { internalFlushcache(status); } catch (IOException ioe) { // Failed to flush the region. Keep going. status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage()); } } // Set the closing flag // From this point new arrivals at the region lock will get NSRE. this.closing.set(true); LOG.info("Closing region {}", this); // Acquire the close lock // The configuration parameter CLOSE_WAIT_ABORT is overloaded to enable both // the new regionserver abort condition and interrupts for running requests. // If CLOSE_WAIT_ABORT is not enabled there is no change from earlier behavior, // we will not attempt to interrupt threads servicing requests nor crash out // the regionserver if something remains stubborn. final boolean canAbort = conf.getBoolean(CLOSE_WAIT_ABORT, DEFAULT_CLOSE_WAIT_ABORT); boolean useTimedWait = false; if (timeoutForWriteLock != null && timeoutForWriteLock != Long.MAX_VALUE) { // convert legacy use of timeoutForWriteLock in seconds to new use in millis timeoutForWriteLock = TimeUnit.SECONDS.toMillis(timeoutForWriteLock); useTimedWait = true; } else if (canAbort) { timeoutForWriteLock = conf.getLong(CLOSE_WAIT_TIME, DEFAULT_CLOSE_WAIT_TIME); useTimedWait = true; } if (LOG.isDebugEnabled()) { LOG.debug((useTimedWait ? 
"Time limited wait" : "Waiting without time limit") + " for close lock on " + this); } final long closeWaitInterval = conf.getLong(CLOSE_WAIT_INTERVAL, DEFAULT_CLOSE_WAIT_INTERVAL); long elapsedWaitTime = 0; if (useTimedWait) { // Sanity check configuration long remainingWaitTime = timeoutForWriteLock; if (remainingWaitTime < closeWaitInterval) { LOG.warn("Time limit for close wait of " + timeoutForWriteLock + " ms is less than the configured lock acquisition wait interval " + closeWaitInterval + " ms, using wait interval as time limit"); remainingWaitTime = closeWaitInterval; } boolean acquired = false; do { long start = EnvironmentEdgeManager.currentTime(); try { acquired = lock.writeLock().tryLock(Math.min(remainingWaitTime, closeWaitInterval), TimeUnit.MILLISECONDS); } catch (InterruptedException e) { // Interrupted waiting for close lock. More likely the server is shutting down, not // normal operation, so aborting upon interrupt while waiting on this lock would not // provide much value. Throw an IOE (as IIOE) like we would in the case where we // fail to acquire the lock. String msg = "Interrupted while waiting for close lock on " + this; LOG.warn(msg, e); throw (InterruptedIOException) new InterruptedIOException(msg).initCause(e); } long elapsed = EnvironmentEdgeManager.currentTime() - start; elapsedWaitTime += elapsed; remainingWaitTime -= elapsed; if (canAbort && !acquired && remainingWaitTime > 0) { // Before we loop to wait again, interrupt all region operations that might // still be in progress, to encourage them to break out of waiting states or // inner loops, throw an exception to clients, and release the read lock via // endRegionOperation. if (LOG.isDebugEnabled()) { LOG.debug("Interrupting region operations after waiting for close lock for " + elapsedWaitTime + " ms on " + this + ", " + remainingWaitTime + " ms remaining"); } interruptRegionOperations(); } } while (!acquired && remainingWaitTime > 0); // If we fail to acquire the lock, trigger an abort if we can; otherwise throw an IOE // to let the caller know we could not proceed with the close. if (!acquired) { String msg = "Failed to acquire close lock on " + this + " after waiting " + elapsedWaitTime + " ms"; LOG.error(msg); if (canAbort) { // If we failed to acquire the write lock, abort the server rsServices.abort(msg, null); } throw new IOException(msg); } } else { long start = EnvironmentEdgeManager.currentTime(); lock.writeLock().lock(); elapsedWaitTime = EnvironmentEdgeManager.currentTime() - start; } if (LOG.isDebugEnabled()) { LOG.debug("Acquired close lock on " + this + " after waiting " + elapsedWaitTime + " ms"); } status.setStatus("Disabling writes for close"); try { if (this.isClosed()) { status.abort("Already got closed by another process"); // SplitTransaction handles the null return null; } LOG.debug("Updates disabled for region " + this); // Don't flush the cache if we are aborting if (!abort && canFlush) { int failedfFlushCount = 0; int flushCount = 0; long tmp = 0; long remainingSize = this.memStoreSizing.getDataSize(); while (remainingSize > 0) { try { internalFlushcache(status); if (flushCount > 0) { LOG.info("Running extra flush, " + flushCount + " (carrying snapshot?) 
" + this); } flushCount++; tmp = this.memStoreSizing.getDataSize(); if (tmp >= remainingSize) { failedfFlushCount++; } remainingSize = tmp; if (failedfFlushCount > 5) { // If we failed 5 times and are unable to clear memory, abort // so we do not lose data throw new DroppedSnapshotException("Failed clearing memory after " + flushCount + " attempts on region: " + Bytes.toStringBinary(getRegionInfo().getRegionName())); } } catch (IOException ioe) { status.setStatus("Failed flush " + this + ", putting online again"); synchronized (writestate) { writestate.writesEnabled = true; } // Have to throw to upper layers. I can't abort server from here. throw ioe; } } } Map> result = new TreeMap<>(Bytes.BYTES_COMPARATOR); if (!stores.isEmpty()) { // initialize the thread pool for closing stores in parallel. ThreadPoolExecutor storeCloserThreadPool = getStoreOpenAndCloseThreadPool("StoreCloser-" + getRegionInfo().getRegionNameAsString()); CompletionService>> completionService = new ExecutorCompletionService<>(storeCloserThreadPool); // close each store in parallel for (HStore store : stores.values()) { MemStoreSize mss = store.getFlushableSize(); if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) { if (getRegionServerServices() != null) { getRegionServerServices().abort("Assertion failed while closing store " + getRegionInfo().getRegionNameAsString() + " " + store + ". flushableSize expected=0, actual={" + mss + "}. Current memStoreSize=" + this.memStoreSizing.getMemStoreSize() + ". Maybe a coprocessor " + "operation failed and left the memstore in a partially updated state.", null); } } completionService.submit(new Callable>>() { @Override public Pair> call() throws IOException { return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close()); } }); } try { for (int i = 0; i < stores.size(); i++) { Future>> future = completionService.take(); Pair> storeFiles = future.get(); List familyFiles = result.get(storeFiles.getFirst()); if (familyFiles == null) { familyFiles = new ArrayList<>(); result.put(storeFiles.getFirst(), familyFiles); } familyFiles.addAll(storeFiles.getSecond()); } } catch (InterruptedException e) { throw throwOnInterrupt(e); } catch (ExecutionException e) { Throwable cause = e.getCause(); if (cause instanceof IOException) { throw (IOException) cause; } throw new IOException(cause); } finally { storeCloserThreadPool.shutdownNow(); } } status.setStatus("Writing region close event to WAL"); // Always write close marker to wal even for read only table. This is not a big problem as we // do not write any data into the region; it is just a meta edit in the WAL file. if ( !abort && wal != null && getRegionServerServices() != null && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) ) { writeRegionCloseMarker(wal); } this.closed.set(true); if (!canFlush) { decrMemStoreSize(this.memStoreSizing.getMemStoreSize()); } else if (this.memStoreSizing.getDataSize() != 0) { LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this); } if (coprocessorHost != null) { status.setStatus("Running coprocessor post-close hooks"); this.coprocessorHost.postClose(abort); } if (this.metricsRegion != null) { this.metricsRegion.close(); } if (this.metricsRegionWrapper != null) { Closeables.close(this.metricsRegionWrapper, true); } status.markComplete("Closed"); LOG.info("Closed {}", this); return result; } finally { lock.writeLock().unlock(); } } /** Wait for all current flushes and compactions of the region to complete */ // TODO HBASE-18906. 
Check the usage (if any) in Phoenix and expose this or give alternate way for // Phoenix needs. public void waitForFlushesAndCompactions() { synchronized (writestate) { if (this.writestate.readOnly) { // we should not wait for replayed flushed if we are read only (for example in case the // region is a secondary replica). return; } boolean interrupted = false; try { while (writestate.compacting.get() > 0 || writestate.flushing) { LOG.debug("waiting for " + writestate.compacting + " compactions" + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this); try { writestate.wait(); } catch (InterruptedException iex) { // essentially ignore and propagate the interrupt back up LOG.warn("Interrupted while waiting in region {}", this); interrupted = true; break; } } } finally { if (interrupted) { Thread.currentThread().interrupt(); } } } } /** * Wait for all current flushes of the region to complete */ public void waitForFlushes() { waitForFlushes(0);// Unbound wait } @Override public boolean waitForFlushes(long timeout) { synchronized (writestate) { if (this.writestate.readOnly) { // we should not wait for replayed flushed if we are read only (for example in case the // region is a secondary replica). return true; } if (!writestate.flushing) return true; long start = EnvironmentEdgeManager.currentTime(); long duration = 0; boolean interrupted = false; LOG.debug("waiting for cache flush to complete for region " + this); try { while (writestate.flushing) { if (timeout > 0 && duration >= timeout) break; try { long toWait = timeout == 0 ? 0 : (timeout - duration); writestate.wait(toWait); } catch (InterruptedException iex) { // essentially ignore and propagate the interrupt back up LOG.warn("Interrupted while waiting in region {}", this); interrupted = true; break; } finally { duration = EnvironmentEdgeManager.currentTime() - start; } } } finally { if (interrupted) { Thread.currentThread().interrupt(); } } LOG.debug("Waited {} ms for region {} flush to complete", duration, this); return !(writestate.flushing); } } @Override public Configuration getReadOnlyConfiguration() { return new ReadOnlyConfiguration(this.conf); } private ThreadPoolExecutor getStoreOpenAndCloseThreadPool(final String threadNamePrefix) { int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); int maxThreads = Math.min(numStores, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)); return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); } ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(final String threadNamePrefix) { int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); int maxThreads = Math.max(1, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX) / numStores); return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); } private static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads, final String threadNamePrefix) { return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS, new ThreadFactory() { private int count = 1; @Override public Thread newThread(Runnable r) { return new Thread(r, threadNamePrefix + "-" + count++); } }); } /** Returns True if its worth doing a flush before we put up the close flag. 
*/ private boolean worthPreFlushing() { return this.memStoreSizing.getDataSize() > this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5); } ////////////////////////////////////////////////////////////////////////////// // HRegion accessors ////////////////////////////////////////////////////////////////////////////// @Override public TableDescriptor getTableDescriptor() { return this.htableDescriptor; } @RestrictedApi(explanation = "Should only be called in tests", link = "", allowedOnPath = ".*/src/test/.*") public void setTableDescriptor(TableDescriptor desc) { htableDescriptor = desc; } /** Returns WAL in use for this region */ public WAL getWAL() { return this.wal; } public BlockCache getBlockCache() { return this.blockCache; } /** * Only used for unit test which doesn't start region server. */ public void setBlockCache(BlockCache blockCache) { this.blockCache = blockCache; } public MobFileCache getMobFileCache() { return this.mobFileCache; } /** * Only used for unit test which doesn't start region server. */ public void setMobFileCache(MobFileCache mobFileCache) { this.mobFileCache = mobFileCache; } /** Returns split policy for this region. */ RegionSplitPolicy getSplitPolicy() { return this.splitPolicy; } /** * A split takes the config from the parent region & passes it to the daughter region's * constructor. If 'conf' was passed, you would end up using the HTD of the parent region in * addition to the new daughter HTD. Pass 'baseConf' to the daughter regions to avoid this tricky * dedupe problem. * @return Configuration object */ Configuration getBaseConf() { return this.baseConf; } /** Returns {@link FileSystem} being used by this region */ public FileSystem getFilesystem() { return fs.getFileSystem(); } /** Returns the {@link HRegionFileSystem} used by this region */ public HRegionFileSystem getRegionFileSystem() { return this.fs; } /** Returns the WAL {@link HRegionFileSystem} used by this region */ HRegionWALFileSystem getRegionWALFileSystem() throws IOException { return new HRegionWALFileSystem(conf, getWalFileSystem(), CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo()); } /** Returns the WAL {@link FileSystem} being used by this region */ FileSystem getWalFileSystem() throws IOException { if (walFS == null) { walFS = CommonFSUtils.getWALFileSystem(conf); } return walFS; } /** * @return the Region directory under WALRootDirectory * @throws IOException if there is an error getting WALRootDir */ public Path getWALRegionDir() throws IOException { if (regionWalDir == null) { regionWalDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(), getRegionInfo().getEncodedName()); } return regionWalDir; } @Override public long getEarliestFlushTimeForAllStores() { return Collections.min(lastStoreFlushTimeMap.values()); } @Override public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException { long result = Long.MAX_VALUE; for (HStore store : stores.values()) { Collection storeFiles = store.getStorefiles(); if (storeFiles == null) { continue; } for (HStoreFile file : storeFiles) { StoreFileReader sfReader = file.getReader(); if (sfReader == null) { continue; } HFile.Reader reader = sfReader.getHFileReader(); if (reader == null) { continue; } if (majorCompactionOnly) { byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY); if (val == null || !Bytes.toBoolean(val)) { continue; } } result = Math.min(result, reader.getFileContext().getFileCreateTime()); } } return result == Long.MAX_VALUE ? 
0 : result; } RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) { long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId; byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes(); regionLoadBldr.clearStoreCompleteSequenceId(); for (byte[] familyName : this.stores.keySet()) { long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName); // Subtract - 1 to go earlier than the current oldest, unflushed edit in memstore; this will // give us a sequence id that is for sure flushed. We want edit replay to start after this // sequence id in this region. If NO_SEQNUM, use the regions maximum flush id. long csid = (earliest == HConstants.NO_SEQNUM) ? lastFlushOpSeqIdLocal : earliest - 1; regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder() .setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build()); } return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId()); } ////////////////////////////////////////////////////////////////////////////// // HRegion maintenance. // // These methods are meant to be called periodically by the HRegionServer for // upkeep. ////////////////////////////////////////////////////////////////////////////// /** * Do preparation for pending compaction. */ protected void doRegionCompactionPrep() throws IOException { } /** * Synchronously compact all stores in the region. *

* This operation could block for a long time, so don't call it from a time-sensitive thread. *
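* <p>
* A minimal sketch (illustrative only; assumes an opened {@code HRegion} named {@code region}):
* <pre>{@code
* // Pass true to force a major compaction of every store regardless of thresholds.
* region.compact(true);
* // Pass false to let each store decide whether a compaction is warranted.
* region.compact(false);
* }</pre>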

* Note that no locks are taken to prevent possible conflicts between compaction and splitting * activities. The regionserver does not normally compact and split in parallel. However by * calling this method you may introduce unexpected and unhandled concurrency. Don't do this * unless you know what you are doing. * @param majorCompaction True to force a major compaction regardless of thresholds */ public void compact(boolean majorCompaction) throws IOException { if (majorCompaction) { stores.values().forEach(HStore::triggerMajorCompaction); } for (HStore s : stores.values()) { Optional compaction = s.requestCompaction(); if (compaction.isPresent()) { ThroughputController controller = null; if (rsServices != null) { controller = CompactionThroughputControllerFactory.create(rsServices, conf); } if (controller == null) { controller = NoLimitThroughputController.INSTANCE; } compact(compaction.get(), s, controller, null); } } } /** * This is a helper function that compact all the stores synchronously. *

* It is used by utilities and testing */ public void compactStores() throws IOException { for (HStore s : stores.values()) { Optional compaction = s.requestCompaction(); if (compaction.isPresent()) { compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null); } } } /** * This is a helper function that compact the given store. *

* It is used by utilities and testing */ void compactStore(byte[] family, ThroughputController throughputController) throws IOException { HStore s = getStore(family); Optional compaction = s.requestCompaction(); if (compaction.isPresent()) { compact(compaction.get(), s, throughputController, null); } } /** * Called by compaction thread and after region is opened to compact the HStores if necessary. *
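* <p>
* A hedged sketch of the expected call pattern (it mirrors {@code compact(boolean)} above;
* assumes an {@code HStore} named {@code store} that belongs to this region):
* <pre>{@code
* Optional<CompactionContext> compaction = store.requestCompaction();
* if (compaction.isPresent()) {
*   // Run the selected compaction without throughput limiting.
*   region.compact(compaction.get(), store, NoLimitThroughputController.INSTANCE);
* }
* }</pre>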

* This operation could block for a long time, so don't call it from a time-sensitive thread. Note * that no locking is necessary at this level because compaction only conflicts with a region * split, and that cannot happen because the region server does them sequentially and not in * parallel. * @param compaction Compaction details, obtained by requestCompaction() * @return whether the compaction completed */ public boolean compact(CompactionContext compaction, HStore store, ThroughputController throughputController) throws IOException { return compact(compaction, store, throughputController, null); } public boolean compact(CompactionContext compaction, HStore store, ThroughputController throughputController, User user) throws IOException { assert compaction != null && compaction.hasSelection(); assert !compaction.getRequest().getFiles().isEmpty(); if (this.closing.get() || this.closed.get()) { LOG.debug("Skipping compaction on " + this + " because closing/closed"); store.cancelRequestedCompaction(compaction); return false; } MonitoredTask status = null; boolean requestNeedsCancellation = true; /* * We are trying to remove / relax the region read lock for compaction. Let's see what are the * potential race conditions among the operations (user scan, region split, region close and * region bulk load). user scan ---> region read lock region split --> region close first --> * region write lock region close --> region write lock region bulk load --> region write lock * read lock is compatible with read lock. ---> no problem with user scan/read region bulk load * does not cause problem for compaction (no consistency problem, store lock will help the store * file accounting). They can run almost concurrently at the region level. The only remaining * race condition is between the region close and compaction. So we will evaluate, below, how * region close intervenes with compaction if compaction does not acquire region read lock. Here * are the steps for compaction: 1. obtain list of StoreFile's 2. create StoreFileScanner's * based on list from #1 3. perform compaction and save resulting files under tmp dir 4. swap in * compacted files #1 is guarded by store lock. This patch does not change this --> no worse or * better For #2, we obtain smallest read point (for region) across all the Scanners (for both * default compactor and stripe compactor). The read points are for user scans. Region keeps the * read points for all currently open user scanners. Compaction needs to know the smallest read * point so that during re-write of the hfiles, it can remove the mvcc points for the cells if * their mvccs are older than the smallest since they are not needed anymore. This will not * conflict with compaction. For #3, it can be performed in parallel to other operations. For #4 * bulk load and compaction don't conflict with each other on the region level (for multi-family * atomicy). Region close and compaction are guarded pretty well by the 'writestate'. In * HRegion#doClose(), we have : synchronized (writestate) { // Disable compacting and flushing * by background threads for this // region. canFlush = !writestate.readOnly; * writestate.writesEnabled = false; LOG.debug("Closing " + this + * ": disabling compactions & flushes"); waitForFlushesAndCompactions(); } * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0. 
and in * HRegion.compact() try { synchronized (writestate) { if (writestate.writesEnabled) { * wasStateSet = true; ++writestate.compacting; } else { String msg = "NOT compacting region " + * this + ". Writes disabled."; LOG.info(msg); status.abort(msg); return false; } } Also in * compactor.performCompaction(): check periodically to see if a system stop is requested if * (closeChecker != null && closeChecker.isTimeLimit(store, now)) { progress.cancel(); return * false; } if (closeChecker != null && closeChecker.isSizeLimit(store, len)) { * progress.cancel(); return false; } */ try { byte[] cf = Bytes.toBytes(store.getColumnFamilyName()); if (stores.get(cf) != store) { LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this + " has been re-instantiated, cancel this compaction request. " + " It may be caused by the roll back of split transaction"); return false; } status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this); if (this.closed.get()) { String msg = "Skipping compaction on " + this + " because closed"; LOG.debug(msg); status.abort(msg); return false; } boolean wasStateSet = false; try { synchronized (writestate) { if (writestate.writesEnabled) { wasStateSet = true; writestate.compacting.incrementAndGet(); } else { String msg = "NOT compacting region " + this + ". Writes disabled."; LOG.info(msg); status.abort(msg); return false; } } LOG.info("Starting compaction of {} in {}{}", store, this, (compaction.getRequest().isOffPeak() ? " as an off-peak compaction" : "")); doRegionCompactionPrep(); try { status.setStatus("Compacting store " + store); // We no longer need to cancel the request on the way out of this // method because Store#compact will clean up unconditionally requestNeedsCancellation = false; store.compact(compaction, throughputController, user); } catch (InterruptedIOException iioe) { String msg = "region " + this + " compaction interrupted"; LOG.info(msg, iioe); status.abort(msg); return false; } } finally { if (wasStateSet) { synchronized (writestate) { writestate.compacting.decrementAndGet(); if (writestate.compacting.get() <= 0) { writestate.notifyAll(); } } } } status.markComplete("Compaction complete"); return true; } finally { if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction); if (status != null) { LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); status.cleanup(); } } } /** * Flush the cache. *

* When this method is called the cache will be flushed unless: *

* <ol>
*   <li>the cache is empty</li>
*   <li>the region is closed.</li>
*   <li>a flush is already in progress</li>
*   <li>writes are disabled</li>
* </ol>
*
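* <p>
* A hedged example (illustrative only; assumes an opened {@code HRegion} named {@code region}):
* <pre>{@code
* FlushResult result = region.flush(true); // flush all stores
* if (result.isCompactionNeeded()) {
*   // The flush suggested a follow-up compaction for this region.
* }
* }</pre>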

* This method may block for some time, so it should not be called from a time-sensitive thread. * @param flushAllStores whether we want to force a flush of all stores * @return FlushResult indicating whether the flush was successful or not and if the region needs * compacting * @throws IOException general io exceptions because a snapshot was not properly persisted. */ // TODO HBASE-18905. We might have to expose a requestFlush API for CPs public FlushResult flush(boolean flushAllStores) throws IOException { return flushcache(flushAllStores, false, FlushLifeCycleTracker.DUMMY); } public interface FlushResult { enum Result { FLUSHED_NO_COMPACTION_NEEDED, FLUSHED_COMPACTION_NEEDED, // Special case where a flush didn't run because there's nothing in the memstores. Used when // bulk loading to know when we can still load even if a flush didn't happen. CANNOT_FLUSH_MEMSTORE_EMPTY, CANNOT_FLUSH } /** Returns the detailed result code */ Result getResult(); /** Returns true if the memstores were flushed, else false */ boolean isFlushSucceeded(); /** Returns True if the flush requested a compaction, else false */ boolean isCompactionNeeded(); } FlushResultImpl flushcache(boolean flushAllStores, boolean writeFlushRequestWalMarker, FlushLifeCycleTracker tracker) throws IOException { List families = null; if (flushAllStores) { families = new ArrayList<>(); families.addAll(this.getTableDescriptor().getColumnFamilyNames()); } return this.flushcache(families, writeFlushRequestWalMarker, tracker); } /** * Flush the cache. When this method is called the cache will be flushed unless: *

* <ol>
*   <li>the cache is empty</li>
*   <li>the region is closed.</li>
*   <li>a flush is already in progress</li>
*   <li>writes are disabled</li>
* </ol>
*
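* <p>
* A hedged sketch for flushing only selected column families (family names are illustrative;
* assumes an opened {@code HRegion} named {@code region}):
* <pre>{@code
* List<byte[]> families = Arrays.asList(Bytes.toBytes("cf1"), Bytes.toBytes("cf2"));
* region.flushcache(families, false, FlushLifeCycleTracker.DUMMY);
* }</pre>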

* This method may block for some time, so it should not be called from a time-sensitive thread. * @param families stores of region to flush. * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL * @param tracker used to track the life cycle of this flush * @return whether the flush is success and whether the region needs compacting * @throws IOException general io exceptions * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was * not properly persisted. The region is put in closing mode, and * the caller MUST abort after this. */ public FlushResultImpl flushcache(List families, boolean writeFlushRequestWalMarker, FlushLifeCycleTracker tracker) throws IOException { // fail-fast instead of waiting on the lock if (this.closing.get()) { String msg = "Skipping flush on " + this + " because closing"; LOG.debug(msg); return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); } MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this); status.setStatus("Acquiring readlock on region"); // block waiting for the lock for flushing cache lock.readLock().lock(); boolean flushed = true; try { if (this.closed.get()) { String msg = "Skipping flush on " + this + " because closed"; LOG.debug(msg); status.abort(msg); flushed = false; return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); } if (coprocessorHost != null) { status.setStatus("Running coprocessor pre-flush hooks"); coprocessorHost.preFlush(tracker); } // TODO: this should be managed within memstore with the snapshot, updated only after flush // successful if (numMutationsWithoutWAL.sum() > 0) { numMutationsWithoutWAL.reset(); dataInMemoryWithoutWAL.reset(); } synchronized (writestate) { if (!writestate.flushing && writestate.writesEnabled) { this.writestate.flushing = true; } else { String msg = "NOT flushing " + this + " as " + (writestate.flushing ? "already flushing" : "writes are not enabled"); LOG.debug(msg); status.abort(msg); flushed = false; return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); } } try { // The reason that we do not always use flushPolicy is, when the flush is // caused by logRoller, we should select stores which must be flushed // rather than could be flushed. Collection specificStoresToFlush = null; if (families != null) { specificStoresToFlush = getSpecificStores(families); } else { specificStoresToFlush = flushPolicy.selectStoresToFlush(); } FlushResultImpl fs = internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker); if (coprocessorHost != null) { status.setStatus("Running post-flush coprocessor hooks"); coprocessorHost.postFlush(tracker); } if (fs.isFlushSucceeded()) { flushesQueued.reset(); } status.markComplete("Flush successful " + fs.toString()); return fs; } finally { synchronized (writestate) { writestate.flushing = false; this.writestate.flushRequested = false; writestate.notifyAll(); } } } finally { lock.readLock().unlock(); if (flushed) { // Don't log this journal stuff if no flush -- confusing. LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); } status.cleanup(); } } /** * get stores which matches the specified families * @return the stores need to be flushed. 
*/ private Collection getSpecificStores(List families) { Collection specificStoresToFlush = new ArrayList<>(); for (byte[] family : families) { specificStoresToFlush.add(stores.get(family)); } return specificStoresToFlush; } /** * Should the store be flushed because it is old enough. *
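* <p>
* The age and change-count thresholds consulted here are configurable; a hedged tuning sketch
* (key names and default values match the constants declared earlier in this class):
* <pre>{@code
* Configuration conf = HBaseConfiguration.create();
* // Periodic flush: consider a store flushable once its oldest edit is older than this (ms).
* conf.setInt("hbase.regionserver.optionalcacheflushinterval", 3600000);
* // Also flush once this many changes have accumulated since the last flush.
* conf.setLong("hbase.regionserver.flush.per.changes", 30000000L);
* }</pre>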

* Every FlushPolicy should call this to determine whether a store is old enough to flush (except * that you always flush all stores). Otherwise the method will always returns true which will * make a lot of flush requests. */ boolean shouldFlushStore(HStore store) { long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), store.getColumnFamilyDescriptor().getName()) - 1; if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) { if (LOG.isDebugEnabled()) { LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " + getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest + " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint()); } return true; } if (this.flushCheckInterval <= 0) { return false; } long now = EnvironmentEdgeManager.currentTime(); if (store.timeOfOldestEdit() < now - this.flushCheckInterval) { if (LOG.isDebugEnabled()) { LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " + getRegionInfo().getEncodedName() + " because time of oldest edit=" + store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now); } return true; } return false; } /** * Should the memstore be flushed now */ boolean shouldFlush(final StringBuilder whyFlush) { whyFlush.setLength(0); // This is a rough measure. if ( this.maxFlushedSeqId > 0 && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint()) ) { whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush"); return true; } long modifiedFlushCheckInterval = flushCheckInterval; if ( getRegionInfo().getTable().isSystemTable() && getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID ) { modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL; } if (modifiedFlushCheckInterval <= 0) { // disabled return false; } long now = EnvironmentEdgeManager.currentTime(); // if we flushed in the recent past, we don't need to do again now if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) { return false; } // since we didn't flush in the recent past, flush now if certain conditions // are met. Return true on first such memstore hit. for (HStore s : stores.values()) { if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) { // we have an old enough edit in the memstore, flush whyFlush.append(s.toString() + " has an old edit so flush to free WALs"); return true; } } return false; } /** * Flushing all stores. * @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker) */ private FlushResult internalFlushcache(MonitoredTask status) throws IOException { return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } /** * Flushing given stores. * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker) */ private FlushResultImpl internalFlushcache(Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status, writeFlushWalMarker, tracker); } /** * Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the * memstore, all of which have also been written to the wal. We need to write those updates in the * memstore out to disk, while being able to process reads/writes as much as possible during the * flush operation. *

* This method may block for some time. Every time you call it, we up the regions sequence id even * if we don't flush; i.e. the returned region id will be at least one larger than the last edit * applied to this region. The returned id does not refer to an actual edit. The returned id can * be used for say installing a bulk loaded file just ahead of the last hfile that was the result * of this flush, etc. * @param wal Null if we're NOT to go via wal. * @param myseqid The seqid to use if wal is null writing out flush file. * @param storesToFlush The list of stores to flush. * @return object describing the flush's state * @throws IOException general io exceptions * @throws DroppedSnapshotException Thrown when replay of WAL is required. */ protected FlushResultImpl internalFlushcache(WAL wal, long myseqid, Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { PrepareFlushResult result = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker); if (result.result == null) { return internalFlushCacheAndCommit(wal, status, result, storesToFlush); } else { return result.result; // early exit due to failure from prepare stage } } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DLS_DEAD_LOCAL_STORE", justification = "FindBugs seems confused about trxId") protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid, Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { if (this.rsServices != null && this.rsServices.isAborted()) { // Don't flush when server aborting, it's unsafe throw new IOException("Aborting flush because server is aborted..."); } final long startTime = EnvironmentEdgeManager.currentTime(); // If nothing to flush, return, but return with a valid unused sequenceId. // Its needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a // bulk loaded file between memory and existing hfiles. It wants a good seqeunceId that belongs // to no other that it can use to associate with the bulk load. Hence this little dance below // to go get one. if (this.memStoreSizing.getDataSize() <= 0) { // Take an update lock so no edits can come into memory just yet. this.updatesLock.writeLock().lock(); WriteEntry writeEntry = null; try { if (this.memStoreSizing.getDataSize() <= 0) { // Presume that if there are still no edits in the memstore, then there are no edits for // this region out in the WAL subsystem so no need to do any trickery clearing out // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for // sure just beyond the last appended region edit and not associated with any edit // (useful as marker when bulk loading, etc.). if (wal != null) { writeEntry = mvcc.begin(); long flushOpSeqId = writeEntry.getWriteNumber(); FlushResultImpl flushResult = new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, "Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker)); mvcc.completeAndWait(writeEntry); // Set to null so we don't complete it again down in finally block. 
writeEntry = null; return new PrepareFlushResult(flushResult, myseqid); } else { return new PrepareFlushResult(new FlushResultImpl( FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid); } } } finally { if (writeEntry != null) { // If writeEntry is non-null, this operation failed; the mvcc transaction failed... // but complete it anyways so it doesn't block the mvcc queue. mvcc.complete(writeEntry); } this.updatesLock.writeLock().unlock(); } } logFatLineOnFlush(storesToFlush, myseqid); // Stop updates while we snapshot the memstore of all of these regions' stores. We only have // to do this for a moment. It is quick. We also set the memstore size to zero here before we // allow updates again so its value will represent the size of the updates received // during flush // We have to take an update lock during snapshot, or else a write could end up in both snapshot // and memstore (makes it difficult to do atomic rows then) status.setStatus("Obtaining lock to block concurrent updates"); // block waiting for the lock for internal flush this.updatesLock.writeLock().lock(); status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName()); MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing(); Map flushedFamilyNamesToSeq = new HashMap<>(); for (HStore store : storesToFlush) { flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(), store.preFlushSeqIDEstimation()); } TreeMap storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR); TreeMap> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); TreeMap storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR); // The sequence id of this flush operation which is used to log FlushMarker and pass to // createFlushContext to use as the store file's sequence id. It can be in advance of edits // still in the memstore, edits that are in other column families yet to be flushed. long flushOpSeqId = HConstants.NO_SEQNUM; // The max flushed sequence id after this flush operation completes. All edits in memstore // will be in advance of this sequence id. long flushedSeqId = HConstants.NO_SEQNUM; byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes(); try { if (wal != null) { Long earliestUnflushedSequenceIdForTheRegion = wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq); if (earliestUnflushedSequenceIdForTheRegion == null) { // This should never happen. This is how startCacheFlush signals flush cannot proceed. String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; status.setStatus(msg); return new PrepareFlushResult( new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid); } flushOpSeqId = getNextSequenceId(wal); // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit flushedSeqId = earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM ? flushOpSeqId : earliestUnflushedSequenceIdForTheRegion.longValue() - 1; } else { // use the provided sequence Id as WAL is not being used for this flush. 
flushedSeqId = flushOpSeqId = myseqid; } for (HStore s : storesToFlush) { storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(), s.createFlushContext(flushOpSeqId, tracker)); // for writing stores to WAL committedFiles.put(s.getColumnFamilyDescriptor().getName(), null); } // write the snapshot start to WAL if (wal != null && !writestate.readOnly) { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); // No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc); } // Prepare flush (take a snapshot) storeFlushCtxs.forEach((name, flush) -> { MemStoreSize snapshotSize = flush.prepare(); totalSizeOfFlushableStores.incMemStoreSize(snapshotSize); storeFlushableSize.put(name, snapshotSize); }); } catch (IOException ex) { doAbortFlushToWAL(wal, flushOpSeqId, committedFiles); throw ex; } finally { this.updatesLock.writeLock().unlock(); } String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " + "flushsize=" + totalSizeOfFlushableStores; status.setStatus(s); doSyncOfUnflushedWALChanges(wal, getRegionInfo()); return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores); } /** * Utility method broken out of internalPrepareFlushCache so that method is smaller. */ private void logFatLineOnFlush(Collection storesToFlush, long sequenceId) { if (!LOG.isInfoEnabled()) { return; } // Log a fat line detailing what is being flushed. StringBuilder perCfExtras = null; if (!isAllFamilies(storesToFlush)) { perCfExtras = new StringBuilder(); for (HStore store : storesToFlush) { MemStoreSize mss = store.getFlushableSize(); perCfExtras.append("; ").append(store.getColumnFamilyName()); perCfExtras.append("={dataSize=").append(StringUtils.byteDesc(mss.getDataSize())); perCfExtras.append(", heapSize=").append(StringUtils.byteDesc(mss.getHeapSize())); perCfExtras.append(", offHeapSize=").append(StringUtils.byteDesc(mss.getOffHeapSize())); perCfExtras.append("}"); } } MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " + storesToFlush.size() + "/" + stores.size() + " column families," + " dataSize=" + StringUtils.byteDesc(mss.getDataSize()) + " heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) + ((perCfExtras != null && perCfExtras.length() > 0) ? perCfExtras.toString() : "") + ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId)); } private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId, final Map> committedFiles) { if (wal == null) return; try { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc); } catch (Throwable t) { LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in " + " region {}", StringUtils.stringifyException(t), this); // ignore this since we will be aborting the RS with DSE. } // we have called wal.startCacheFlush(), now we have to abort it wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); } /** * Sync unflushed WAL changes. 
See HBASE-8208 for details */ private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri) throws IOException { if (wal == null) { return; } try { wal.sync(); // ensure that flush marker is sync'ed } catch (IOException ioe) { wal.abortCacheFlush(hri.getEncodedNameAsBytes()); throw ioe; } } /** Returns True if passed Set is all families in the region. */ private boolean isAllFamilies(Collection families) { return families == null || this.stores.size() == families.size(); } /** * Writes a marker to WAL indicating a flush is requested but cannot be complete due to various * reasons. Ignores exceptions from WAL. Returns whether the write succeeded. * @return whether WAL write was successful */ private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) { if (writeFlushWalMarker && wal != null && !writestate.readOnly) { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH, getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR)); try { WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, mvcc); return true; } catch (IOException e) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received exception while trying to write the flush request to wal", e); } } return false; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", justification = "Intentional; notify is about completed flush") FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status, PrepareFlushResult prepareResult, Collection storesToFlush) throws IOException { // prepare flush context is carried via PrepareFlushResult TreeMap storeFlushCtxs = prepareResult.storeFlushCtxs; TreeMap> committedFiles = prepareResult.committedFiles; long startTime = prepareResult.startTime; long flushOpSeqId = prepareResult.flushOpSeqId; long flushedSeqId = prepareResult.flushedSeqId; String s = "Flushing stores of " + this; status.setStatus(s); if (LOG.isTraceEnabled()) LOG.trace(s); // Any failure from here on out will be catastrophic requiring server // restart so wal content can be replayed and put back into the memstore. // Otherwise, the snapshot content while backed up in the wal, it will not // be part of the current running servers state. boolean compactionRequested = false; long flushedOutputFileSize = 0; try { // A. Flush memstore to all the HStores. // Keep running vector of all store files that includes both old and the // just-made new flush store file. The new flushed file is still in the // tmp directory. for (StoreFlushContext flush : storeFlushCtxs.values()) { flush.flushCache(status); } // Switch snapshot (in memstore) -> new hfile (thus causing // all the store scanners to reset/reseek). for (Map.Entry flushEntry : storeFlushCtxs.entrySet()) { StoreFlushContext sfc = flushEntry.getValue(); boolean needsCompaction = sfc.commit(status); if (needsCompaction) { compactionRequested = true; } byte[] storeName = flushEntry.getKey(); List storeCommittedFiles = sfc.getCommittedFiles(); committedFiles.put(storeName, storeCommittedFiles); // Flush committed no files, indicating flush is empty or flush was canceled if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) { MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName); prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize); } flushedOutputFileSize += sfc.getOutputFileSize(); } storeFlushCtxs.clear(); // Set down the memstore size by amount of flush. 
MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); this.decrMemStoreSize(mss); // Increase the size of this Region for the purposes of quota. Noop if quotas are disabled. // During startup, quota manager may not be initialized yet. if (rsServices != null) { RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager(); if (quotaManager != null) { quotaManager.getRegionSizeStore().incrementRegionSize(this.getRegionInfo(), flushedOutputFileSize); } } if (wal != null) { // write flush marker to WAL. If fail, we should throw DroppedSnapshotException FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, mvcc); } } catch (Throwable t) { // An exception here means that the snapshot was not persisted. // The wal needs to be replayed so its content is restored to memstore. // Currently, only a server restart will do this. // We used to only catch IOEs but its possible that we'd get other // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch // all and sundry. if (wal != null) { try { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc); } catch (Throwable ex) { LOG.warn( getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL", ex); // ignore this since we will be aborting the RS with DSE. } wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); } DroppedSnapshotException dse = new DroppedSnapshotException( "region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()), t); status.abort("Flush failed: " + StringUtils.stringifyException(t)); // Callers for flushcache() should catch DroppedSnapshotException and abort the region server. // However, since we may have the region read lock, we cannot call close(true) here since // we cannot promote to a write lock. Instead we are setting closing so that all other region // operations except for close will be rejected. this.closing.set(true); if (rsServices != null) { // This is a safeguard against the case where the caller fails to explicitly handle aborting rsServices.abort("Replay of WAL required. Forcing server shutdown", dse); } throw dse; } // If we get to here, the HStores have been written. if (wal != null) { wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId); } // Record latest flush time for (HStore store : storesToFlush) { this.lastStoreFlushTimeMap.put(store, startTime); } this.maxFlushedSeqId = flushedSeqId; this.lastFlushOpSeqId = flushOpSeqId; // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). 
synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } long time = EnvironmentEdgeManager.currentTime() - startTime; MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize(); String msg = "Finished flush of" + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" + mss.getDataSize() + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" + mss.getHeapSize() + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" + memstoresize + " for " + this.getRegionInfo().getEncodedName() + " in " + time + "ms, sequenceid=" + flushOpSeqId + ", compaction requested=" + compactionRequested + ((wal == null) ? "; wal=null" : ""); LOG.info(msg); status.setStatus(msg); if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(), time, mss.getDataSize(), flushedOutputFileSize); } return new FlushResultImpl(compactionRequested ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED : FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId); } /** * Method to safely get the next sequence number. * @return Next sequence number unassociated with any actual edit. */ protected long getNextSequenceId(final WAL wal) throws IOException { WriteEntry we = mvcc.begin(); mvcc.completeAndWait(we); return we.getWriteNumber(); } ////////////////////////////////////////////////////////////////////////////// // get() methods for client use. ////////////////////////////////////////////////////////////////////////////// @Override public RegionScannerImpl getScanner(Scan scan) throws IOException { return getScanner(scan, null); } @Override public RegionScannerImpl getScanner(Scan scan, List additionalScanners) throws IOException { return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE); } private RegionScannerImpl getScanner(Scan scan, List additionalScanners, long nonceGroup, long nonce) throws IOException { return TraceUtil.trace(() -> { startRegionOperation(Operation.SCAN); try { // Verify families are all valid if (!scan.hasFamilies()) { // Adding all families to scanner for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { scan.addFamily(family); } } else { for (byte[] family : scan.getFamilyMap().keySet()) { checkFamily(family); } } return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce); } finally { closeRegionOperation(Operation.SCAN); } }, () -> createRegionSpan("Region.getScanner")); } protected RegionScannerImpl instantiateRegionScanner(Scan scan, List additionalScanners, long nonceGroup, long nonce) throws IOException { if (scan.isReversed()) { if (scan.getFilter() != null) { scan.getFilter().setReversed(true); } return new ReversedRegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); } return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); } /** * Prepare a delete for a row mutation processor * @param delete The passed delete is modified by this method. WARNING! 
*/ private void prepareDelete(Delete delete) throws IOException { // Check to see if this is a deleteRow insert if (delete.getFamilyCellMap().isEmpty()) { for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { // Don't eat the timestamp delete.addFamily(family, delete.getTimestamp()); } } else { for (byte[] family : delete.getFamilyCellMap().keySet()) { if (family == null) { throw new NoSuchColumnFamilyException("Empty family is invalid"); } checkFamily(family); } } } @Override public void delete(Delete delete) throws IOException { TraceUtil.trace(() -> { checkReadOnly(); checkResources(); startRegionOperation(Operation.DELETE); try { // All edits for the given row (across all column families) must happen atomically. return mutate(delete); } finally { closeRegionOperation(Operation.DELETE); } }, () -> createRegionSpan("Region.delete")); } /** * Set up correct timestamps in the KVs in Delete object. *

* Caller should have the row and region locks. */ private void prepareDeleteTimestamps(Mutation mutation, Map> familyMap, byte[] byteNow) throws IOException { for (Map.Entry> e : familyMap.entrySet()) { byte[] family = e.getKey(); List cells = e.getValue(); assert cells instanceof RandomAccess; Map kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR); int listSize = cells.size(); for (int i = 0; i < listSize; i++) { Cell cell = cells.get(i); // Check if time is LATEST, change to time of most recent addition if so // This is expensive. if ( cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && PrivateCellUtil.isDeleteType(cell) ) { byte[] qual = CellUtil.cloneQualifier(cell); Integer count = kvCount.get(qual); if (count == null) { kvCount.put(qual, 1); } else { kvCount.put(qual, count + 1); } count = kvCount.get(qual); Get get = new Get(CellUtil.cloneRow(cell)); get.setMaxVersions(count); get.addColumn(family, qual); if (coprocessorHost != null) { if ( !coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell, byteNow, get) ) { updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); } } else { updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); } } else { PrivateCellUtil.updateLatestStamp(cell, byteNow); } } } } private void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow) throws IOException { try (RegionScanner scanner = getScanner(new Scan(get))) { // NOTE: Please don't use HRegion.get() instead, // because it will copy cells to heap. See HBASE-26036 List result = new ArrayList<>(); scanner.next(result); if (result.size() < count) { // Nothing to delete PrivateCellUtil.updateLatestStamp(cell, byteNow); return; } if (result.size() > count) { throw new RuntimeException("Unexpected size: " + result.size()); } Cell getCell = result.get(count - 1); PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp()); } } @Override public void put(Put put) throws IOException { TraceUtil.trace(() -> { checkReadOnly(); // Do a rough check that we have resources to accept a write. The check is // 'rough' in that between the resource check and the call to obtain a // read lock, resources may run out. For now, the thought is that this // will be extremely rare; we'll deal with it when it happens. checkResources(); startRegionOperation(Operation.PUT); try { // All edits for the given row (across all column families) must happen atomically. return mutate(put); } finally { closeRegionOperation(Operation.PUT); } }, () -> createRegionSpan("Region.put")); } /** * Class that tracks the progress of a batch operations, accumulating status codes and tracking * the index at which processing is proceeding. These batch operations may get split into * mini-batches for processing. 
*/ private abstract static class BatchOperation { protected final T[] operations; protected final OperationStatus[] retCodeDetails; protected final WALEdit[] walEditsFromCoprocessors; // reference family cell maps directly so coprocessors can mutate them if desired protected final Map>[] familyCellMaps; // For Increment/Append operations protected final Result[] results; protected final HRegion region; protected int nextIndexToProcess = 0; protected final ObservedExceptionsInBatch observedExceptions; // Durability of the batch (highest durability of all operations) protected Durability durability; protected boolean atomic = false; public BatchOperation(final HRegion region, T[] operations) { this.operations = operations; this.retCodeDetails = new OperationStatus[operations.length]; Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN); this.walEditsFromCoprocessors = new WALEdit[operations.length]; familyCellMaps = new Map[operations.length]; this.results = new Result[operations.length]; this.region = region; observedExceptions = new ObservedExceptionsInBatch(); durability = Durability.USE_DEFAULT; } /** * Visitor interface for batch operations */ @FunctionalInterface interface Visitor { /** * @param index operation index * @return If true continue visiting remaining entries, break otherwise */ boolean visit(int index) throws IOException; } /** * Helper method for visiting pending/ all batch operations */ public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor) throws IOException { assert lastIndexExclusive <= this.size(); for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) { if (!pendingOnly || isOperationPending(i)) { if (!visitor.visit(i)) { break; } } } } public abstract Mutation getMutation(int index); public abstract long getNonceGroup(int index); public abstract long getNonce(int index); /** * This method is potentially expensive and useful mostly for non-replay CP path. */ public abstract Mutation[] getMutationsForCoprocs(); public abstract boolean isInReplay(); public abstract long getOrigLogSeqNum(); public abstract void startRegionOperation() throws IOException; public abstract void closeRegionOperation() throws IOException; /** * Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs * CP prePut()/preDelete()/preIncrement()/preAppend() hooks for all mutations in a batch. This * is intended to operate on entire batch and will be called from outside of class to check and * prepare batch. This can be implemented by calling helper method * {@link #checkAndPrepareMutation(int, long)} in a 'for' loop over mutations. */ public abstract void checkAndPrepare() throws IOException; /** * Implement any Put request specific check and prepare logic here. Please refer to * {@link #checkAndPrepareMutation(Mutation, long)} for how its used. */ protected abstract void checkAndPreparePut(final Put p) throws IOException; /** * If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell * count, tags and timestamp for all cells of all operations in a mini-batch. 
*/ public abstract void prepareMiniBatchOperations( MiniBatchOperationInProgress miniBatchOp, long timestamp, final List acquiredRowLocks) throws IOException; /** * Write mini-batch operations to MemStore */ public abstract WriteEntry writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException; protected void writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, final long writeNumber) throws IOException { MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing(); visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { // We need to update the sequence id for following reasons. // 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id. // 2) If no WAL, FSWALEntry won't be used // we use durability of the original mutation for the mutation passed by CP. if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) { region.updateSequenceId(familyCellMaps[index].values(), writeNumber); } applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting); return true; }); // update memStore size region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(), memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount()); } public boolean isDone() { return nextIndexToProcess == operations.length; } public int size() { return operations.length; } public boolean isOperationPending(int index) { return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN; } public List getClusterIds() { assert size() != 0; return getMutation(0).getClusterIds(); } boolean isAtomic() { return atomic; } /** * Helper method that checks and prepares only one mutation. This can be used to implement * {@link #checkAndPrepare()} for entire Batch. NOTE: As CP * prePut()/preDelete()/preIncrement()/preAppend() hooks may modify mutations, this method * should be called after prePut()/preDelete()/preIncrement()/preAppend() CP hooks are run for * the mutation */ protected void checkAndPrepareMutation(Mutation mutation, final long timestamp) throws IOException { region.checkRow(mutation.getRow(), "batchMutate"); if (mutation instanceof Put) { // Check the families in the put. If bad, skip this one. 
checkAndPreparePut((Put) mutation); region.checkTimestamps(mutation.getFamilyCellMap(), timestamp); } else if (mutation instanceof Delete) { region.prepareDelete((Delete) mutation); } else if (mutation instanceof Increment || mutation instanceof Append) { region.checkFamilies(mutation.getFamilyCellMap().keySet()); } } protected void checkAndPrepareMutation(int index, long timestamp) throws IOException { Mutation mutation = getMutation(index); try { this.checkAndPrepareMutation(mutation, timestamp); if (mutation instanceof Put || mutation instanceof Delete) { // store the family map reference to allow for mutations familyCellMaps[index] = mutation.getFamilyCellMap(); } // store durability for the batch (highest durability of all operations in the batch) Durability tmpDur = region.getEffectiveDurability(mutation.getDurability()); if (tmpDur.ordinal() > durability.ordinal()) { durability = tmpDur; } } catch (NoSuchColumnFamilyException nscfe) { final String msg = "No such column family in batch mutation in region " + this; if (observedExceptions.hasSeenNoSuchFamily()) { LOG.warn(msg + nscfe.getMessage()); } else { LOG.warn(msg, nscfe); observedExceptions.sawNoSuchFamily(); } retCodeDetails[index] = new OperationStatus(OperationStatusCode.BAD_FAMILY, nscfe.getMessage()); if (isAtomic()) { // fail, atomic means all or none throw nscfe; } } catch (FailedSanityCheckException fsce) { final String msg = "Batch Mutation did not pass sanity check in region " + this; if (observedExceptions.hasSeenFailedSanityCheck()) { LOG.warn(msg + fsce.getMessage()); } else { LOG.warn(msg, fsce); observedExceptions.sawFailedSanityCheck(); } retCodeDetails[index] = new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); if (isAtomic()) { throw fsce; } } catch (WrongRegionException we) { final String msg = "Batch mutation had a row that does not belong to this region " + this; if (observedExceptions.hasSeenWrongRegion()) { LOG.warn(msg + we.getMessage()); } else { LOG.warn(msg, we); observedExceptions.sawWrongRegion(); } retCodeDetails[index] = new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); if (isAtomic()) { throw we; } } } /** * Creates Mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which a row * lock can be acquired. All mutations with locked rows are considered to be In-progress * operations and hence the name {@link MiniBatchOperationInProgress}. Mini batch is window over * {@link BatchOperation} and contains contiguous pending operations. * @param acquiredRowLocks keeps track of rowLocks acquired. */ public MiniBatchOperationInProgress lockRowsAndBuildMiniBatch(List acquiredRowLocks) throws IOException { int readyToWriteCount = 0; int lastIndexExclusive = 0; RowLock prevRowLock = null; for (; lastIndexExclusive < size(); lastIndexExclusive++) { // It reaches the miniBatchSize, stop here and process the miniBatch // This only applies to non-atomic batch operations. 
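        /*
         * Illustrative note (hypothetical client-side names): the mini-batch size cap below applies
         * only to non-atomic batches. An atomic batch -- for example the mutations of a client
         * RowMutations, or any batch containing an Increment or Append -- is locked and applied in
         * its entirety:
         *
         *   RowMutations rm = new RowMutations(row);
         *   rm.add(new Put(row).addColumn(cf, q1, v1));
         *   rm.add(new Delete(row).addColumn(cf, q2));
         *   table.mutateRow(rm);  // reaches this code with isAtomic() == true
         */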
if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) { break; } if (!isOperationPending(lastIndexExclusive)) { continue; } // HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting // RS handlers, covering both MutationBatchOperation and ReplayBatchOperation // The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't // pass the isOperationPending check Map> curFamilyCellMap = getMutation(lastIndexExclusive).getFamilyCellMap(); try { // start the protector before acquiring row lock considering performance, and will finish // it when encountering exception region.storeHotnessProtector.start(curFamilyCellMap); } catch (RegionTooBusyException rtbe) { region.storeHotnessProtector.finish(curFamilyCellMap); if (isAtomic()) { throw rtbe; } retCodeDetails[lastIndexExclusive] = new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage()); continue; } Mutation mutation = getMutation(lastIndexExclusive); // If we haven't got any rows in our batch, we should block to get the next one. RowLock rowLock = null; boolean throwException = false; try { // if atomic then get exclusive lock, else shared lock rowLock = region.getRowLock(mutation.getRow(), !isAtomic(), prevRowLock); } catch (TimeoutIOException | InterruptedIOException e) { // NOTE: We will retry when other exceptions, but we should stop if we receive // TimeoutIOException or InterruptedIOException as operation has timed out or // interrupted respectively. throwException = true; throw e; } catch (IOException ioe) { LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(mutation.getRow()), this, ioe); if (isAtomic()) { // fail, atomic means all or none throwException = true; throw ioe; } } catch (Throwable throwable) { throwException = true; throw throwable; } finally { if (throwException) { region.storeHotnessProtector.finish(curFamilyCellMap); } } if (rowLock == null) { // We failed to grab another lock if (isAtomic()) { region.storeHotnessProtector.finish(curFamilyCellMap); throw new IOException("Can't apply all operations atomically!"); } break; // Stop acquiring more rows for this batch } else { if (rowLock != prevRowLock) { // It is a different row now, add this to the acquiredRowLocks and // set prevRowLock to the new returned rowLock acquiredRowLocks.add(rowLock); prevRowLock = rowLock; } } readyToWriteCount++; } return createMiniBatch(lastIndexExclusive, readyToWriteCount); } protected MiniBatchOperationInProgress createMiniBatch(final int lastIndexExclusive, final int readyToWriteCount) { return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails, walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount); } /** * Builds separate WALEdit per nonce by applying input mutations. If WALEdits from CP are * present, they are merged to result WALEdit. */ public List> buildWALEdits(final MiniBatchOperationInProgress miniBatchOp) throws IOException { List> walEdits = new ArrayList<>(); visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() { private Pair curWALEditForNonce; @Override public boolean visit(int index) throws IOException { Mutation m = getMutation(index); // we use durability of the original mutation for the mutation passed by CP. if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) { region.recordMutationWithoutWal(m.getFamilyCellMap()); return true; } // the batch may contain multiple nonce keys (replay case). 
If so, write WALEdit for each. // Given how nonce keys are originally written, these should be contiguous. // They don't have to be, it will still work, just write more WALEdits than needed. long nonceGroup = getNonceGroup(index); long nonce = getNonce(index); if ( curWALEditForNonce == null || curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup || curWALEditForNonce.getFirst().getNonce() != nonce ) { curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce), new WALEdit(miniBatchOp.getCellCount(), isInReplay())); walEdits.add(curWALEditForNonce); } WALEdit walEdit = curWALEditForNonce.getSecond(); // Add WAL edits from CPs. WALEdit fromCP = walEditsFromCoprocessors[index]; if (fromCP != null) { for (Cell cell : fromCP.getCells()) { walEdit.add(cell); } } walEdit.add(familyCellMaps[index]); return true; } }); return walEdits; } /** * This method completes mini-batch operations by calling postBatchMutate() CP hook (if * required) and completing mvcc. */ public void completeMiniBatchOperations( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { if (writeEntry != null) { region.mvcc.completeAndWait(writeEntry); } } public void doPostOpCleanupForMiniBatch( final MiniBatchOperationInProgress miniBatchOp, final WALEdit walEdit, boolean success) throws IOException { doFinishHotnessProtector(miniBatchOp); } private void doFinishHotnessProtector(final MiniBatchOperationInProgress miniBatchOp) { // check and return if the protector is not enabled if (!region.storeHotnessProtector.isEnable()) { return; } // miniBatchOp is null, if and only if lockRowsAndBuildMiniBatch throwing exception. // This case was handled. if (miniBatchOp == null) { return; } final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive(); for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) { switch (retCodeDetails[i].getOperationStatusCode()) { case SUCCESS: case FAILURE: region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap()); break; default: // do nothing // We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the // STORE_TOO_BUSY case is handled in StoreHotnessProtector#start break; } } } /** * Atomically apply the given map of family->edits to the memstore. This handles the consistency * control on its own, but the caller should already have locked updatesLock.readLock(). This * also does not check the families for validity. * @param familyMap Map of Cells by family */ protected void applyFamilyMapToMemStore(Map> familyMap, MemStoreSizing memstoreAccounting) throws IOException { for (Map.Entry> e : familyMap.entrySet()) { byte[] family = e.getKey(); List cells = e.getValue(); assert cells instanceof RandomAccess; region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting); } } } /** * Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most of * the logic is same. 
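 * <p>
 * Illustrative sketch (hypothetical names): Increment and Append operations in such a batch produce
 * per-operation {@link Result}s; a caller that does not need the updated value can skip the
 * read-back.
 * <pre>{@code
 * Increment inc = new Increment(row).addColumn(cf, Bytes.toBytes("hits"), 1L);
 * inc.setReturnResults(false); // apply the delta without returning the new cell
 * table.increment(inc);
 * }</pre>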
*/ private static class MutationBatchOperation extends BatchOperation { // For nonce operations private long nonceGroup; private long nonce; protected boolean canProceed; public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic, long nonceGroup, long nonce) { super(region, operations); this.atomic = atomic; this.nonceGroup = nonceGroup; this.nonce = nonce; } @Override public Mutation getMutation(int index) { return this.operations[index]; } @Override public long getNonceGroup(int index) { return nonceGroup; } @Override public long getNonce(int index) { return nonce; } @Override public Mutation[] getMutationsForCoprocs() { return this.operations; } @Override public boolean isInReplay() { return false; } @Override public long getOrigLogSeqNum() { return SequenceId.NO_SEQUENCE_ID; } @Override public void startRegionOperation() throws IOException { region.startRegionOperation(Operation.BATCH_MUTATE); } @Override public void closeRegionOperation() throws IOException { region.closeRegionOperation(Operation.BATCH_MUTATE); } @Override public void checkAndPreparePut(Put p) throws IOException { region.checkFamilies(p.getFamilyCellMap().keySet()); } @Override public void checkAndPrepare() throws IOException { // index 0: puts, index 1: deletes, index 2: increments, index 3: append final int[] metrics = { 0, 0, 0, 0 }; visitBatchOperations(true, this.size(), new Visitor() { private long now = EnvironmentEdgeManager.currentTime(); private WALEdit walEdit; @Override public boolean visit(int index) throws IOException { // Run coprocessor pre hook outside of locks to avoid deadlock if (region.coprocessorHost != null) { if (walEdit == null) { walEdit = new WALEdit(); } callPreMutateCPHook(index, walEdit, metrics); if (!walEdit.isEmpty()) { walEditsFromCoprocessors[index] = walEdit; walEdit = null; } } if (isOperationPending(index)) { // TODO: Currently validation is done with current time before acquiring locks and // updates are done with different timestamps after acquiring locks. This behavior is // inherited from the code prior to this change. Can this be changed? checkAndPrepareMutation(index, now); } return true; } }); // FIXME: we may update metrics twice! here for all operations bypassed by CP and later in // normal processing. // Update metrics in same way as it is done when we go the normal processing route (we now // update general metrics though a Coprocessor did the work). if (region.metricsRegion != null) { if (metrics[0] > 0) { // There were some Puts in the batch. region.metricsRegion.updatePut(); } if (metrics[1] > 0) { // There were some Deletes in the batch. region.metricsRegion.updateDelete(); } if (metrics[2] > 0) { // There were some Increment in the batch. region.metricsRegion.updateIncrement(); } if (metrics[3] > 0) { // There were some Append in the batch. 
region.metricsRegion.updateAppend(); } } } @Override public void prepareMiniBatchOperations(MiniBatchOperationInProgress miniBatchOp, long timestamp, final List acquiredRowLocks) throws IOException { // For nonce operations canProceed = startNonceOperation(); visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { Mutation mutation = getMutation(index); if (mutation instanceof Put) { HRegion.updateCellTimestamps(familyCellMaps[index].values(), Bytes.toBytes(timestamp)); miniBatchOp.incrementNumOfPuts(); } else if (mutation instanceof Delete) { region.prepareDeleteTimestamps(mutation, familyCellMaps[index], Bytes.toBytes(timestamp)); miniBatchOp.incrementNumOfDeletes(); } else if (mutation instanceof Increment || mutation instanceof Append) { boolean returnResults; if (mutation instanceof Increment) { returnResults = ((Increment) mutation).isReturnResults(); } else { returnResults = ((Append) mutation).isReturnResults(); } // For nonce operations if (!canProceed) { Result result; if (returnResults) { // convert duplicate increment/append to get List results = region.get(toGet(mutation), false, nonceGroup, nonce); result = Result.create(results); } else { result = Result.EMPTY_RESULT; } retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); return true; } Result result = null; if (region.coprocessorHost != null) { if (mutation instanceof Increment) { result = region.coprocessorHost.preIncrementAfterRowLock((Increment) mutation); } else { result = region.coprocessorHost.preAppendAfterRowLock((Append) mutation); } } if (result != null) { retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, returnResults ? result : Result.EMPTY_RESULT); return true; } List results = returnResults ? new ArrayList<>(mutation.size()) : null; familyCellMaps[index] = reckonDeltas(mutation, results, timestamp); this.results[index] = results != null ? Result.create(results) : Result.EMPTY_RESULT; if (mutation instanceof Increment) { miniBatchOp.incrementNumOfIncrements(); } else { miniBatchOp.incrementNumOfAppends(); } } region.rewriteCellTags(familyCellMaps[index], mutation); // update cell count if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { for (List cells : mutation.getFamilyCellMap().values()) { miniBatchOp.addCellCount(cells.size()); } } WALEdit fromCP = walEditsFromCoprocessors[index]; if (fromCP != null) { miniBatchOp.addCellCount(fromCP.size()); } return true; }); if (region.coprocessorHost != null) { // calling the pre CP hook for batch mutation region.coprocessorHost.preBatchMutate(miniBatchOp); checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp); } } /** * Starts the nonce operation for a mutation, if needed. * @return whether to proceed this mutation. */ private boolean startNonceOperation() throws IOException { if ( region.rsServices == null || region.rsServices.getNonceManager() == null || nonce == HConstants.NO_NONCE ) { return true; } boolean canProceed; try { canProceed = region.rsServices.getNonceManager().startOperation(nonceGroup, nonce, region.rsServices); } catch (InterruptedException ex) { throw new InterruptedIOException("Nonce start operation interrupted"); } return canProceed; } /** * Ends nonce operation for a mutation, if needed. * @param success Whether the operation for this nonce has succeeded. 
*/ private void endNonceOperation(boolean success) { if ( region.rsServices != null && region.rsServices.getNonceManager() != null && nonce != HConstants.NO_NONCE ) { region.rsServices.getNonceManager().endOperation(nonceGroup, nonce, success); } } private static Get toGet(final Mutation mutation) throws IOException { assert mutation instanceof Increment || mutation instanceof Append; Get get = new Get(mutation.getRow()); CellScanner cellScanner = mutation.cellScanner(); while (cellScanner.advance()) { Cell cell = cellScanner.current(); get.addColumn(CellUtil.cloneFamily(cell), CellUtil.cloneQualifier(cell)); } if (mutation instanceof Increment) { // Increment Increment increment = (Increment) mutation; get.setTimeRange(increment.getTimeRange().getMin(), increment.getTimeRange().getMax()); } else { // Append Append append = (Append) mutation; get.setTimeRange(append.getTimeRange().getMin(), append.getTimeRange().getMax()); } for (Entry entry : mutation.getAttributesMap().entrySet()) { get.setAttribute(entry.getKey(), entry.getValue()); } return get; } private Map> reckonDeltas(Mutation mutation, List results, long now) throws IOException { assert mutation instanceof Increment || mutation instanceof Append; Map> ret = new TreeMap<>(Bytes.BYTES_COMPARATOR); // Process a Store/family at a time. for (Map.Entry> entry : mutation.getFamilyCellMap().entrySet()) { final byte[] columnFamilyName = entry.getKey(); List deltas = entry.getValue(); // Reckon for the Store what to apply to WAL and MemStore. List toApply = reckonDeltasByStore(region.stores.get(columnFamilyName), mutation, now, deltas, results); if (!toApply.isEmpty()) { for (Cell cell : toApply) { HStore store = region.getStore(cell); if (store == null) { region.checkFamily(CellUtil.cloneFamily(cell)); } else { ret.computeIfAbsent(store.getColumnFamilyDescriptor().getName(), key -> new ArrayList<>()).add(cell); } } } } return ret; } /** * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed column * family/Store. Does Get of current value and then adds passed in deltas for this Store * returning the result. * @param mutation The encompassing Mutation object * @param deltas Changes to apply to this Store; either increment amount or data to append * @param results In here we accumulate all the Cells we are to return to the client. If null, * client doesn't want results returned. * @return Resulting Cells after deltas have been applied to current values. Side * effect is our filling out of the results List. */ private List reckonDeltasByStore(HStore store, Mutation mutation, long now, List deltas, List results) throws IOException { assert mutation instanceof Increment || mutation instanceof Append; byte[] columnFamily = store.getColumnFamilyDescriptor().getName(); List> cellPairs = new ArrayList<>(deltas.size()); // Sort the cells so that they match the order that they appear in the Get results. // Otherwise, we won't be able to find the existing values if the cells are not specified // in order by the client since cells are in an array list. deltas.sort(store.getComparator()); // Get previous values for all columns in this family. 
Get get = new Get(mutation.getRow()); for (Cell cell : deltas) { get.addColumn(columnFamily, CellUtil.cloneQualifier(cell)); } TimeRange tr; if (mutation instanceof Increment) { tr = ((Increment) mutation).getTimeRange(); } else { tr = ((Append) mutation).getTimeRange(); } if (tr != null) { get.setTimeRange(tr.getMin(), tr.getMax()); } try (RegionScanner scanner = region.getScanner(new Scan(get))) { // NOTE: Please don't use HRegion.get() instead, // because it will copy cells to heap. See HBASE-26036 List currentValues = new ArrayList<>(); scanner.next(currentValues); // Iterate the input columns and update existing values if they were found, otherwise // add new column initialized to the delta amount int currentValuesIndex = 0; for (int i = 0; i < deltas.size(); i++) { Cell delta = deltas.get(i); Cell currentValue = null; if ( currentValuesIndex < currentValues.size() && CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta) ) { currentValue = currentValues.get(currentValuesIndex); if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) { currentValuesIndex++; } } // Switch on whether this an increment or an append building the new Cell to apply. Cell newCell; if (mutation instanceof Increment) { long deltaAmount = getLongValue(delta); final long newValue = currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount; newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) -> Bytes.toBytes(newValue)); } else { newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) -> ByteBuffer .wrap(new byte[delta.getValueLength() + oldCell.getValueLength()]) .put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength()) .put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength()) .array()); } if (region.maxCellSize > 0) { int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell); if (newCellSize > region.maxCellSize) { String msg = "Cell with size " + newCellSize + " exceeds limit of " + region.maxCellSize + " bytes in region " + this; LOG.debug(msg); throw new DoNotRetryIOException(msg); } } cellPairs.add(new Pair<>(currentValue, newCell)); // Add to results to get returned to the Client. If null, cilent does not want results. if (results != null) { results.add(newCell); } } // Give coprocessors a chance to update the new cells before apply to WAL or memstore if (region.coprocessorHost != null) { // Here the operation must be increment or append. cellPairs = mutation instanceof Increment ? region.coprocessorHost.postIncrementBeforeWAL(mutation, cellPairs) : region.coprocessorHost.postAppendBeforeWAL(mutation, cellPairs); } } return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList()); } private static Cell reckonDelta(final Cell delta, final Cell currentCell, final byte[] columnFamily, final long now, Mutation mutation, Function supplier) throws IOException { // Forward any tags found on the delta. List tags = TagUtil.carryForwardTags(delta); if (currentCell != null) { tags = TagUtil.carryForwardTags(tags, currentCell); tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); byte[] newValue = supplier.apply(currentCell); return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) .setRow(mutation.getRow(), 0, mutation.getRow().length) .setFamily(columnFamily, 0, columnFamily.length) // copy the qualifier if the cell is located in shared memory. 
.setQualifier(CellUtil.cloneQualifier(delta)) .setTimestamp(Math.max(currentCell.getTimestamp() + 1, now)) .setType(KeyValue.Type.Put.getCode()).setValue(newValue, 0, newValue.length) .setTags(TagUtil.fromList(tags)).build(); } else { tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); PrivateCellUtil.updateLatestStamp(delta, now); return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags); } } /** Returns Get the long out of the passed in Cell */ private static long getLongValue(final Cell cell) throws DoNotRetryIOException { int len = cell.getValueLength(); if (len != Bytes.SIZEOF_LONG) { // throw DoNotRetryIOException instead of IllegalArgumentException throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide"); } return PrivateCellUtil.getValueAsLong(cell); } @Override public List> buildWALEdits(final MiniBatchOperationInProgress miniBatchOp) throws IOException { List> walEdits = super.buildWALEdits(miniBatchOp); // for MutationBatchOperation, more than one nonce is not allowed if (walEdits.size() > 1) { throw new IOException("Found multiple nonce keys per batch!"); } return walEdits; } @Override public WriteEntry writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, @Nullable WriteEntry writeEntry) throws IOException { if (writeEntry == null) { writeEntry = region.mvcc.begin(); } super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber()); return writeEntry; } @Override public void completeMiniBatchOperations( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { // TODO: can it be done after completing mvcc? // calling the post CP hook for batch mutation if (region.coprocessorHost != null) { region.coprocessorHost.postBatchMutate(miniBatchOp); } super.completeMiniBatchOperations(miniBatchOp, writeEntry); if (nonce != HConstants.NO_NONCE) { if (region.rsServices != null && region.rsServices.getNonceManager() != null) { region.rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce, writeEntry.getWriteNumber()); } } } @Override public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress miniBatchOp, final WALEdit walEdit, boolean success) throws IOException { super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success); if (miniBatchOp != null) { // synced so that the coprocessor contract is adhered to. 
if (region.coprocessorHost != null) { visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { // only for successful puts/deletes/increments/appends if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) { Mutation m = getMutation(i); if (m instanceof Put) { region.coprocessorHost.postPut((Put) m, walEdit); } else if (m instanceof Delete) { region.coprocessorHost.postDelete((Delete) m, walEdit); } else if (m instanceof Increment) { Result result = region.getCoprocessorHost().postIncrement((Increment) m, results[i], walEdit); if (result != results[i]) { retCodeDetails[i] = new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result); } } else if (m instanceof Append) { Result result = region.getCoprocessorHost().postAppend((Append) m, results[i], walEdit); if (result != results[i]) { retCodeDetails[i] = new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result); } } } return true; }); } // For nonce operations if (canProceed && nonce != HConstants.NO_NONCE) { boolean[] areAllIncrementsAndAppendsSuccessful = new boolean[] { true }; visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { Mutation mutation = getMutation(i); if (mutation instanceof Increment || mutation instanceof Append) { if (retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.SUCCESS) { areAllIncrementsAndAppendsSuccessful[0] = false; return false; } } return true; }); endNonceOperation(areAllIncrementsAndAppendsSuccessful[0]); } // See if the column families were consistent through the whole thing. // if they were then keep them. If they were not then pass a null. // null will be treated as unknown. // Total time taken might be involving Puts, Deletes, Increments and Appends. // Split the time for puts and deletes based on the total number of Puts, Deletes, // Increments and Appends. if (region.metricsRegion != null) { if (miniBatchOp.getNumOfPuts() > 0) { // There were some Puts in the batch. region.metricsRegion.updatePut(); } if (miniBatchOp.getNumOfDeletes() > 0) { // There were some Deletes in the batch. region.metricsRegion.updateDelete(); } if (miniBatchOp.getNumOfIncrements() > 0) { // There were some Increments in the batch. region.metricsRegion.updateIncrement(); } if (miniBatchOp.getNumOfAppends() > 0) { // There were some Appends in the batch. region.metricsRegion.updateAppend(); } } } if (region.coprocessorHost != null) { // call the coprocessor hook to do any finalization steps after the put is done region.coprocessorHost.postBatchMutateIndispensably( miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success); } } /** * Runs prePut/preDelete/preIncrement/preAppend coprocessor hook for input mutation in a batch * @param metrics Array of 2 ints. index 0: count of puts, index 1: count of deletes, index 2: * count of increments and 3: count of appends */ private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics) throws IOException { Mutation m = getMutation(index); if (m instanceof Put) { if (region.coprocessorHost.prePut((Put) m, walEdit)) { // pre hook says skip this Put // mark as success and skip in doMiniBatchMutation metrics[0]++; retCodeDetails[index] = OperationStatus.SUCCESS; } } else if (m instanceof Delete) { Delete curDel = (Delete) m; if (curDel.getFamilyCellMap().isEmpty()) { // handle deleting a row case // TODO: prepareDelete() has been called twice, before and after preDelete() CP hook. // Can this be avoided? 
region.prepareDelete(curDel); } if (region.coprocessorHost.preDelete(curDel, walEdit)) { // pre hook says skip this Delete // mark as success and skip in doMiniBatchMutation metrics[1]++; retCodeDetails[index] = OperationStatus.SUCCESS; } } else if (m instanceof Increment) { Increment increment = (Increment) m; Result result = region.coprocessorHost.preIncrement(increment, walEdit); if (result != null) { // pre hook says skip this Increment // mark as success and skip in doMiniBatchMutation metrics[2]++; retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); } } else if (m instanceof Append) { Append append = (Append) m; Result result = region.coprocessorHost.preAppend(append, walEdit); if (result != null) { // pre hook says skip this Append // mark as success and skip in doMiniBatchMutation metrics[3]++; retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); } } else { String msg = "Put/Delete/Increment/Append mutations only supported in a batch"; retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg); if (isAtomic()) { // fail, atomic means all or none throw new IOException(msg); } } } // TODO Support Increment/Append operations private void checkAndMergeCPMutations(final MiniBatchOperationInProgress miniBatchOp, final List acquiredRowLocks, final long timestamp) throws IOException { visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> { // we pass (i - firstIndex) below since the call expects a relative index Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess); if (cpMutations == null) { return true; } // Else Coprocessor added more Mutations corresponding to the Mutation at this index. Mutation mutation = getMutation(i); for (Mutation cpMutation : cpMutations) { this.checkAndPrepareMutation(cpMutation, timestamp); // Acquire row locks. If not, the whole batch will fail. acquiredRowLocks.add(region.getRowLock(cpMutation.getRow(), true, null)); // Returned mutations from coprocessor correspond to the Mutation at index i. We can // directly add the cells from those mutations to the familyMaps of this mutation. Map> cpFamilyMap = cpMutation.getFamilyCellMap(); region.rewriteCellTags(cpFamilyMap, mutation); // will get added to the memStore later mergeFamilyMaps(familyCellMaps[i], cpFamilyMap); // The durability of returned mutation is replaced by the corresponding mutation. // If the corresponding mutation contains the SKIP_WAL, we shouldn't count the // cells of returned mutation. if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { for (List cells : cpFamilyMap.values()) { miniBatchOp.addCellCount(cells.size()); } } } return true; }); } private void mergeFamilyMaps(Map> familyMap, Map> toBeMerged) { for (Map.Entry> entry : toBeMerged.entrySet()) { List cells = familyMap.get(entry.getKey()); if (cells == null) { familyMap.put(entry.getKey(), entry.getValue()); } else { cells.addAll(entry.getValue()); } } } } /** * Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most * of the logic is same. 
*/ private static final class ReplayBatchOperation extends BatchOperation { private long origLogSeqNum = 0; public ReplayBatchOperation(final HRegion region, MutationReplay[] operations, long origLogSeqNum) { super(region, operations); this.origLogSeqNum = origLogSeqNum; } @Override public Mutation getMutation(int index) { return this.operations[index].mutation; } @Override public long getNonceGroup(int index) { return this.operations[index].nonceGroup; } @Override public long getNonce(int index) { return this.operations[index].nonce; } @Override public Mutation[] getMutationsForCoprocs() { return null; } @Override public boolean isInReplay() { return true; } @Override public long getOrigLogSeqNum() { return this.origLogSeqNum; } @Override public void startRegionOperation() throws IOException { region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE); } @Override public void closeRegionOperation() throws IOException { region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE); } /** * During replay, there could exist column families which are removed between region server * failure and replay */ @Override protected void checkAndPreparePut(Put p) throws IOException { Map> familyCellMap = p.getFamilyCellMap(); List nonExistentList = null; for (byte[] family : familyCellMap.keySet()) { if (!region.htableDescriptor.hasColumnFamily(family)) { if (nonExistentList == null) { nonExistentList = new ArrayList<>(); } nonExistentList.add(family); } } if (nonExistentList != null) { for (byte[] family : nonExistentList) { // Perhaps schema was changed between crash and replay LOG.info("No family for {} omit from reply in region {}.", Bytes.toString(family), this); familyCellMap.remove(family); } } } @Override public void checkAndPrepare() throws IOException { long now = EnvironmentEdgeManager.currentTime(); visitBatchOperations(true, this.size(), (int index) -> { checkAndPrepareMutation(index, now); return true; }); } @Override public void prepareMiniBatchOperations(MiniBatchOperationInProgress miniBatchOp, long timestamp, final List acquiredRowLocks) throws IOException { visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { // update cell count for (List cells : getMutation(index).getFamilyCellMap().values()) { miniBatchOp.addCellCount(cells.size()); } return true; }); } @Override public WriteEntry writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum()); return writeEntry; } @Override public void completeMiniBatchOperations( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { super.completeMiniBatchOperations(miniBatchOp, writeEntry); region.mvcc.advanceTo(getOrigLogSeqNum()); } } public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, long nonce) throws IOException { // As it stands, this is used for 3 things // * batchMutate with single mutation - put/delete/increment/append, separate or from // checkAndMutate. // * coprocessor calls (see ex. BulkDeleteEndpoint). // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd... 
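    /*
     * Illustrative sketch (hypothetical names): a single client increment arrives here as a
     * one-element atomic batch. When a nonce is supplied, a retried RPC is detected and converted
     * to a read of the current value instead of applying the delta twice.
     *
     *   Result r = table.increment(
     *       new Increment(Bytes.toBytes("counter-row"))
     *           .addColumn(Bytes.toBytes("cf"), Bytes.toBytes("hits"), 1L));
     */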
return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce)); } @Override public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException { // If the mutations has any Increment/Append operations, we need to do batchMutate atomically boolean atomic = Arrays.stream(mutations).anyMatch(m -> m instanceof Increment || m instanceof Append); return batchMutate(mutations, atomic); } OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic) throws IOException { return TraceUtil.trace( () -> batchMutate(mutations, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE), () -> createRegionSpan("Region.batchMutate")); } public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId) throws IOException { if ( !RegionReplicaUtil.isDefaultReplica(getRegionInfo()) && replaySeqId < lastReplayedOpenRegionSeqId ) { // if it is a secondary replica we should ignore these entries silently // since they are coming out of order if (LOG.isTraceEnabled()) { LOG.trace(getRegionInfo().getEncodedName() + " : " + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); for (MutationReplay mut : mutations) { LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation); } } OperationStatus[] statuses = new OperationStatus[mutations.length]; for (int i = 0; i < statuses.length; i++) { statuses[i] = OperationStatus.SUCCESS; } return statuses; } return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId)); } /** * Perform a batch of mutations. *
 * <p/>
 * Operations in a batch are stored with the highest durability specified among all operations in
 * the batch, except for {@link Durability#SKIP_WAL}.
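 * <p>
 * Illustrative sketch (hypothetical names): if one operation in a batch requests a stronger
 * durability, the whole batch is synced at that level.
 * <pre>{@code
 * Put p1 = new Put(r1).addColumn(cf, q, v);
 * p1.setDurability(Durability.ASYNC_WAL);
 * Put p2 = new Put(r2).addColumn(cf, q, v);
 * p2.setDurability(Durability.FSYNC_WAL);
 * table.batch(Arrays.asList(p1, p2), new Object[2]); // both edits are synced at FSYNC_WAL
 * }</pre>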
 * <p/>
* This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with * {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[])} with * {@link MutationBatchOperation} instance as an argument. As the processing of replay batch and * mutation batch is very similar, lot of code is shared by providing generic methods in base * class {@link BatchOperation}. The logic for this method and * {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which are * overridden by derived classes to implement special behavior. * @param batchOp contains the list of mutations * @return an array of OperationStatus which internally contains the OperationStatusCode and the * exceptionMessage if any. * @throws IOException if an IO problem is encountered */ private OperationStatus[] batchMutate(BatchOperation batchOp) throws IOException { boolean initialized = false; batchOp.startRegionOperation(); try { while (!batchOp.isDone()) { if (!batchOp.isInReplay()) { checkReadOnly(); } checkResources(); if (!initialized) { this.writeRequestsCount.add(batchOp.size()); // validate and prepare batch for write, for MutationBatchOperation it also calls CP // prePut()/preDelete()/preIncrement()/preAppend() hooks batchOp.checkAndPrepare(); initialized = true; } doMiniBatchMutate(batchOp); requestFlushIfNeeded(); } } finally { if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.getTableName(), batchOp.size()); } batchOp.closeRegionOperation(); } return batchOp.retCodeDetails; } /** * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[])} In here we * also handle replay of edits on region recover. Also gets change in size brought about by * applying {@code batchOp}. */ private void doMiniBatchMutate(BatchOperation batchOp) throws IOException { boolean success = false; WALEdit walEdit = null; WriteEntry writeEntry = null; boolean locked = false; // We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive) MiniBatchOperationInProgress miniBatchOp = null; /** Keep track of the locks we hold so we can release them in finally clause */ List acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size()); // Check for thread interrupt status in case we have been signaled from // #interruptRegionOperation. checkInterrupt(); try { // STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with // locked rows miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks); // We've now grabbed as many mutations off the list as we can // Ensure we acquire at least one. if (miniBatchOp.getReadyToWriteCount() <= 0) { // Nothing to put/delete/increment/append -- an exception in the above such as // NoSuchColumnFamily? return; } // Check for thread interrupt status in case we have been signaled from // #interruptRegionOperation. Do it before we take the lock and disable interrupts for // the WAL append. checkInterrupt(); lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount()); locked = true; // From this point until memstore update this operation should not be interrupted. disableInterrupts(); // STEP 2. 
Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp // We should record the timestamp only after we have acquired the rowLock, // otherwise, newer puts/deletes/increment/append are not guaranteed to have a newer // timestamp long now = EnvironmentEdgeManager.currentTime(); batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks); // STEP 3. Build WAL edit List> walEdits = batchOp.buildWALEdits(miniBatchOp); // STEP 4. Append the WALEdits to WAL and sync. for (Iterator> it = walEdits.iterator(); it.hasNext();) { Pair nonceKeyWALEditPair = it.next(); walEdit = nonceKeyWALEditPair.getSecond(); NonceKey nonceKey = nonceKeyWALEditPair.getFirst(); if (walEdit != null && !walEdit.isEmpty()) { writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now, nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum()); } // Complete mvcc for all but last writeEntry (for replay case) if (it.hasNext() && writeEntry != null) { mvcc.complete(writeEntry); writeEntry = null; } } // STEP 5. Write back to memStore // NOTE: writeEntry can be null here writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry); // STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and // complete mvcc for last writeEntry batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry); writeEntry = null; success = true; } finally { // Call complete rather than completeAndWait because we probably had error if walKey != null if (writeEntry != null) mvcc.complete(writeEntry); if (locked) { this.updatesLock.readLock().unlock(); } releaseRowLocks(acquiredRowLocks); enableInterrupts(); final int finalLastIndexExclusive = miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size(); final boolean finalSuccess = success; batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> { Mutation mutation = batchOp.getMutation(i); if (mutation instanceof Increment || mutation instanceof Append) { if (finalSuccess) { batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.SUCCESS, batchOp.results[i]); } else { batchOp.retCodeDetails[i] = OperationStatus.FAILURE; } } else { batchOp.retCodeDetails[i] = finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE; } return true; }); batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess); batchOp.nextIndexToProcess = finalLastIndexExclusive; } } /** * Returns effective durability from the passed durability and the table descriptor. */ private Durability getEffectiveDurability(Durability d) { return d == Durability.USE_DEFAULT ? 
this.regionDurability : d; } @Override @Deprecated public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException { CheckAndMutate checkAndMutate; try { CheckAndMutate.Builder builder = CheckAndMutate.newBuilder(row) .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange); if (mutation instanceof Put) { checkAndMutate = builder.build((Put) mutation); } else if (mutation instanceof Delete) { checkAndMutate = builder.build((Delete) mutation); } else { throw new DoNotRetryIOException( "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); } } catch (IllegalArgumentException e) { throw new DoNotRetryIOException(e.getMessage()); } return checkAndMutate(checkAndMutate).isSuccess(); } @Override @Deprecated public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation) throws IOException { CheckAndMutate checkAndMutate; try { CheckAndMutate.Builder builder = CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange); if (mutation instanceof Put) { checkAndMutate = builder.build((Put) mutation); } else if (mutation instanceof Delete) { checkAndMutate = builder.build((Delete) mutation); } else { throw new DoNotRetryIOException( "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); } } catch (IllegalArgumentException e) { throw new DoNotRetryIOException(e.getMessage()); } return checkAndMutate(checkAndMutate).isSuccess(); } @Override @Deprecated public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException { CheckAndMutate checkAndMutate; try { checkAndMutate = CheckAndMutate.newBuilder(row) .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange).build(rm); } catch (IllegalArgumentException e) { throw new DoNotRetryIOException(e.getMessage()); } return checkAndMutate(checkAndMutate).isSuccess(); } @Override @Deprecated public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm) throws IOException { CheckAndMutate checkAndMutate; try { checkAndMutate = CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange).build(rm); } catch (IllegalArgumentException e) { throw new DoNotRetryIOException(e.getMessage()); } return checkAndMutate(checkAndMutate).isSuccess(); } @Override public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate) throws IOException { return checkAndMutate(checkAndMutate, HConstants.NO_NONCE, HConstants.NO_NONCE); } public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate, long nonceGroup, long nonce) throws IOException { return TraceUtil.trace(() -> checkAndMutateInternal(checkAndMutate, nonceGroup, nonce), () -> createRegionSpan("Region.checkAndMutate")); } private CheckAndMutateResult checkAndMutateInternal(CheckAndMutate checkAndMutate, long nonceGroup, long nonce) throws IOException { byte[] row = checkAndMutate.getRow(); Filter filter = null; byte[] family = null; byte[] qualifier = null; CompareOperator op = null; ByteArrayComparable comparator = null; if (checkAndMutate.hasFilter()) { filter = checkAndMutate.getFilter(); } else { family = checkAndMutate.getFamily(); qualifier = checkAndMutate.getQualifier(); op = checkAndMutate.getCompareOp(); comparator = new BinaryComparator(checkAndMutate.getValue()); } TimeRange 
timeRange = checkAndMutate.getTimeRange(); Mutation mutation = null; RowMutations rowMutations = null; if (checkAndMutate.getAction() instanceof Mutation) { mutation = (Mutation) checkAndMutate.getAction(); } else { rowMutations = (RowMutations) checkAndMutate.getAction(); } if (mutation != null) { checkMutationType(mutation); checkRow(mutation, row); } else { checkRow(rowMutations, row); } checkReadOnly(); // TODO, add check for value length also move this check to the client checkResources(); startRegionOperation(); try { Get get = new Get(row); if (family != null) { checkFamily(family); get.addColumn(family, qualifier); } if (filter != null) { get.setFilter(filter); } if (timeRange != null) { get.setTimeRange(timeRange.getMin(), timeRange.getMax()); } // Lock row - note that doBatchMutate will relock this row if called checkRow(row, "doCheckAndRowMutate"); RowLock rowLock = getRowLock(get.getRow(), false, null); try { if (this.getCoprocessorHost() != null) { CheckAndMutateResult result = getCoprocessorHost().preCheckAndMutateAfterRowLock(checkAndMutate); if (result != null) { return result; } } // NOTE: We used to wait here until mvcc caught up: mvcc.await(); // Supposition is that now all changes are done under row locks, then when we go to read, // we'll get the latest on this row. boolean matches = false; long cellTs = 0; try (RegionScanner scanner = getScanner(new Scan(get))) { // NOTE: Please don't use HRegion.get() instead, // because it will copy cells to heap. See HBASE-26036 List result = new ArrayList<>(1); scanner.next(result); if (filter != null) { if (!result.isEmpty()) { matches = true; cellTs = result.get(0).getTimestamp(); } } else { boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0; if (result.isEmpty() && valueIsNull) { matches = op != CompareOperator.NOT_EQUAL; } else if (result.size() > 0 && valueIsNull) { matches = (result.get(0).getValueLength() == 0) == (op != CompareOperator.NOT_EQUAL); cellTs = result.get(0).getTimestamp(); } else if (result.size() == 1) { Cell kv = result.get(0); cellTs = kv.getTimestamp(); int compareResult = PrivateCellUtil.compareValue(kv, comparator); matches = matches(op, compareResult); } } } // If matches, perform the mutation or the rowMutations if (matches) { // We have acquired the row lock already. If the system clock is NOT monotonically // non-decreasing (see HBASE-14070) we should make sure that the mutation has a // larger timestamp than what was observed via Get. doBatchMutate already does this, but // there is no way to pass the cellTs. See HBASE-14054. long now = EnvironmentEdgeManager.currentTime(); long ts = Math.max(now, cellTs); // ensure write is not eclipsed byte[] byteTs = Bytes.toBytes(ts); if (mutation != null) { if (mutation instanceof Put) { updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs); } // And else 'delete' is not needed since it already does a second get, and sets the // timestamp from get (see prepareDeleteTimestamps). } else { for (Mutation m : rowMutations.getMutations()) { if (m instanceof Put) { updateCellTimestamps(m.getFamilyCellMap().values(), byteTs); } } // And else 'delete' is not needed since it already does a second get, and sets the // timestamp from get (see prepareDeleteTimestamps). } // All edits for the given row (across all column families) must happen atomically. 
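        /*
         * Illustrative client-side sketch (hypothetical names) of a request that reaches this point:
         *
         *   CheckAndMutate cam = CheckAndMutate.newBuilder(row)
         *       .ifEquals(cf, q, expectedValue)
         *       .build(new Put(row).addColumn(cf, q, newValue));
         *   CheckAndMutateResult res = table.checkAndMutate(cam);
         *   boolean applied = res.isSuccess();
         */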
Result r; if (mutation != null) { r = mutate(mutation, true, nonceGroup, nonce).getResult(); } else { r = mutateRow(rowMutations, nonceGroup, nonce); } this.checkAndMutateChecksPassed.increment(); return new CheckAndMutateResult(true, r); } this.checkAndMutateChecksFailed.increment(); return new CheckAndMutateResult(false, null); } finally { rowLock.release(); } } finally { closeRegionOperation(); } } private void checkMutationType(final Mutation mutation) throws DoNotRetryIOException { if ( !(mutation instanceof Put) && !(mutation instanceof Delete) && !(mutation instanceof Increment) && !(mutation instanceof Append) ) { throw new org.apache.hadoop.hbase.DoNotRetryIOException( "Action must be Put or Delete or Increment or Delete"); } } private void checkRow(final Row action, final byte[] row) throws DoNotRetryIOException { if (!Bytes.equals(row, action.getRow())) { throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match"); } } private boolean matches(final CompareOperator op, final int compareResult) { boolean matches = false; switch (op) { case LESS: matches = compareResult < 0; break; case LESS_OR_EQUAL: matches = compareResult <= 0; break; case EQUAL: matches = compareResult == 0; break; case NOT_EQUAL: matches = compareResult != 0; break; case GREATER_OR_EQUAL: matches = compareResult >= 0; break; case GREATER: matches = compareResult > 0; break; default: throw new RuntimeException("Unknown Compare op " + op.name()); } return matches; } private OperationStatus mutate(Mutation mutation) throws IOException { return mutate(mutation, false); } private OperationStatus mutate(Mutation mutation, boolean atomic) throws IOException { return mutate(mutation, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE); } private OperationStatus mutate(Mutation mutation, boolean atomic, long nonceGroup, long nonce) throws IOException { OperationStatus[] status = this.batchMutate(new Mutation[] { mutation }, atomic, nonceGroup, nonce); if (status[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { throw new FailedSanityCheckException(status[0].getExceptionMsg()); } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { throw new NoSuchColumnFamilyException(status[0].getExceptionMsg()); } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) { throw new RegionTooBusyException(status[0].getExceptionMsg()); } return status[0]; } /** * Complete taking the snapshot on the region. Writes the region info and adds references to the * working snapshot directory. TODO for api consistency, consider adding another version with no * {@link ForeignExceptionSnare} arg. (In the future other cancellable HRegion methods could * eventually add a {@link ForeignExceptionSnare}, or we could do something fancier). * @param desc snapshot description object * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to bail * out. This is allowed to be null and will just be ignored in that case. 
* @throws IOException if there is an external or internal error causing the snapshot to fail */ public void addRegionToSnapshot(SnapshotDescription desc, ForeignExceptionSnare exnSnare) throws IOException { Path rootDir = CommonFSUtils.getRootDir(conf); Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf); SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(), snapshotDir, desc, exnSnare); manifest.addRegion(this); } private void updateSequenceId(final Iterable> cellItr, final long sequenceId) throws IOException { for (List cells : cellItr) { if (cells == null) return; for (Cell cell : cells) { PrivateCellUtil.setSequenceId(cell, sequenceId); } } } /** * Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP} * provided current timestamp. */ private static void updateCellTimestamps(final Iterable> cellItr, final byte[] now) throws IOException { for (List cells : cellItr) { if (cells == null) continue; // Optimization: 'foreach' loop is not used. See: // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i = 0; i < listSize; i++) { PrivateCellUtil.updateLatestStamp(cells.get(i), now); } } } /** * Possibly rewrite incoming cell tags. */ private void rewriteCellTags(Map> familyMap, final Mutation m) { // Check if we have any work to do and early out otherwise // Update these checks as more logic is added here if (m.getTTL() == Long.MAX_VALUE) { return; } // From this point we know we have some work to do for (Map.Entry> e : familyMap.entrySet()) { List cells = e.getValue(); assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i = 0; i < listSize; i++) { Cell cell = cells.get(i); List newTags = TagUtil.carryForwardTags(null, cell); newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL()); // Rewrite the cell with the updated set of tags cells.set(i, PrivateCellUtil.createCell(cell, newTags)); } } } /** * Check if resources to support an update. *
<p>
* We throw RegionTooBusyException if above memstore limit and expect client to retry using some * kind of backoff */ private void checkResources() throws RegionTooBusyException { // If catalog region, do not impose resource constraints or block updates. if (this.getRegionInfo().isMetaRegion()) { return; } MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) { blockedRequestsCount.increment(); requestFlush(); // Don't print current limit because it will vary too much. The message is used as a key // over in RetriesExhaustedWithDetailsException processing. final String regionName = this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName(); final String serverName = this.getRegionServerServices() == null ? "unknown" : (this.getRegionServerServices().getServerName() == null ? "unknown" : this.getRegionServerServices().getServerName().toString()); RegionTooBusyException rtbe = new RegionTooBusyException("Over memstore limit=" + org.apache.hadoop.hbase.procedure2.util.StringUtils.humanSize(this.blockingMemStoreSize) + ", regionName=" + regionName + ", server=" + serverName); LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe); throw rtbe; } } /** * @throws IOException Throws exception if region is in read-only mode. */ private void checkReadOnly() throws IOException { if (isReadOnly()) { throw new DoNotRetryIOException("region is read only"); } } private void checkReadsEnabled() throws IOException { if (!this.writestate.readsEnabled) { throw new IOException(getRegionInfo().getEncodedName() + ": The region's reads are disabled. Cannot serve the request"); } } public void setReadsEnabled(boolean readsEnabled) { if (readsEnabled && !this.writestate.readsEnabled) { LOG.info("Enabling reads for {}", getRegionInfo().getEncodedName()); } this.writestate.setReadsEnabled(readsEnabled); } /** * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be * set; when set we will run operations that make sense in the increment/append * scenario but that do not make sense otherwise. * @see #applyToMemStore(HStore, Cell, MemStoreSizing) */ private void applyToMemStore(HStore store, List cells, boolean delta, MemStoreSizing memstoreAccounting) throws IOException { // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1; if (upsert) { store.upsert(cells, getSmallestReadPoint(), memstoreAccounting); } else { store.add(cells, memstoreAccounting); } } /** * @see #applyToMemStore(HStore, List, boolean, MemStoreSizing) */ private void applyToMemStore(HStore store, Cell cell, MemStoreSizing memstoreAccounting) throws IOException { // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! if (store == null) { checkFamily(CellUtil.cloneFamily(cell)); // Unreachable because checkFamily will throw exception } store.add(cell, memstoreAccounting); } /** * Check the collection of families for validity. 
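   * <p>
   * Minimal usage sketch (illustrative only; the region reference and family names are
   * assumptions for the example):
   *
   * <pre>
   * {@code
   * region.checkFamilies(Arrays.asList(Bytes.toBytes("cf1"), Bytes.toBytes("cf2")));
   * }
   * </pre>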
*/ public void checkFamilies(Collection families) throws NoSuchColumnFamilyException { for (byte[] family : families) { checkFamily(family); } } /** * Check the collection of families for valid timestamps * @param now current timestamp */ public void checkTimestamps(final Map> familyMap, long now) throws FailedSanityCheckException { if (timestampSlop == HConstants.LATEST_TIMESTAMP) { return; } long maxTs = now + timestampSlop; for (List kvs : familyMap.values()) { // Optimization: 'foreach' loop is not used. See: // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects assert kvs instanceof RandomAccess; int listSize = kvs.size(); for (int i = 0; i < listSize; i++) { Cell cell = kvs.get(i); // see if the user-side TS is out of range. latest = server-side long ts = cell.getTimestamp(); if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) { throw new FailedSanityCheckException( "Timestamp for KV out of range " + cell + " (too.new=" + timestampSlop + ")"); } } } } /* * @return True if size is over the flush threshold */ private boolean isFlushSize(MemStoreSize size) { return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize(); } private void deleteRecoveredEdits(FileSystem fs, Iterable files) throws IOException { for (Path file : files) { if (!fs.delete(file, false)) { LOG.error("Failed delete of {}", file); } else { LOG.debug("Deleted recovered.edits file={}", file); } } } /** * Read the edits put under this region by wal splitting process. Put the recovered edits back up * into this region. *
<p>
* We can ignore any wal message that has a sequence ID that's equal to or lower than minSeqId. * (Because we know such messages are already reflected in the HFiles.) *
<p>
* While this is running we are putting pressure on memory yet we are outside of our usual * accounting because we are not yet an onlined region (this stuff is being run as part of Region * initialization). This means that if we're up against global memory limits, we'll not be flagged * to flush because we are not online. We can't be flushed by usual mechanisms anyways; we're not * yet online so our relative sequenceids are not yet aligned with WAL sequenceids -- not till we * come up online, post processing of split edits. *
<p>
* But to help relieve memory pressure, at least manage our own heap size flushing if are in * excess of per-region limits. Flushing, though, we have to be careful and avoid using the * regionserver/wal sequenceid. Its running on a different line to whats going on in here in this * region context so if we crashed replaying these edits, but in the midst had a flush that used * the regionserver wal with a sequenceid in excess of whats going on in here in this region and * with its split editlogs, then we could miss edits the next time we go to recover. So, we have * to flush inline, using seqids that make sense in a this single region context only -- until we * online. * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of the maxSeqId * for the store to be applied, else its skipped. * @return the sequence id of the last edit added to this region out of the recovered edits log or * minSeqId if nothing added from editlogs. */ long replayRecoveredEditsIfAny(Map maxSeqIdInStores, final CancelableProgressable reporter, final MonitoredTask status) throws IOException { long minSeqIdForTheRegion = -1; for (Long maxSeqIdInStore : maxSeqIdInStores.values()) { if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) { minSeqIdForTheRegion = maxSeqIdInStore; } } long seqId = minSeqIdForTheRegion; String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR); if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) { FileSystem walFS = getWalFileSystem(); FileSystem rootFS = getFilesystem(); Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), getRegionInfo().getEncodedName()); Path regionWALDir = getWALRegionDir(); Path regionDir = FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo()); // We made a mistake in HBASE-20734 so we need to do this dirty hack... NavigableSet filesUnderWrongRegionWALDir = WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir); seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, filesUnderWrongRegionWALDir, reporter, regionDir)); // This is to ensure backwards compatability with HBASE-20723 where recovered edits can appear // under the root dir even if walDir is set. NavigableSet filesUnderRootDir = Collections.emptyNavigableSet(); if (!regionWALDir.equals(regionDir)) { filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir); seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS, filesUnderRootDir, reporter, regionDir)); } NavigableSet files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir); seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir)); if (seqId > minSeqIdForTheRegion) { // Then we added some edits to memory. Flush and cleanup split edit files. internalFlushcache(null, seqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } // Now delete the content of recovered edits. We're done w/ them. if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) { // For debugging data loss issues! // If this flag is set, make use of the hfile archiving by making recovered.edits a fake // column family. 
Have to fake out file type too by casting our recovered.edits as // storefiles String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName(); Set fakeStoreFiles = new HashSet<>(files.size()); for (Path file : files) { fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true)); } getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles); } else { deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir)); deleteRecoveredEdits(rootFS, filesUnderRootDir); } } else { Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr); FileSystem fs = recoveredEditsDir.getFileSystem(conf); FileStatus[] files = fs.listStatus(recoveredEditsDir); LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length, recoveredEditsDir); if (files != null) { for (FileStatus file : files) { // it is safe to trust the zero-length in this case because we've been through rename and // lease recovery in the above. if (isZeroLengthThenDelete(fs, file, file.getPath())) { continue; } seqId = Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs)); } } if (seqId > minSeqIdForTheRegion) { // Then we added some edits to memory. Flush and cleanup split edit files. internalFlushcache(null, seqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } deleteRecoveredEdits(fs, Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList())); } return seqId; } private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs, final NavigableSet files, final CancelableProgressable reporter, final Path regionDir) throws IOException { long seqid = minSeqIdForTheRegion; if (LOG.isDebugEnabled()) { LOG.debug("Found " + (files == null ? 0 : files.size()) + " recovered edits file(s) under " + regionDir); } if (files == null || files.isEmpty()) { return minSeqIdForTheRegion; } for (Path edits : files) { if (edits == null || !fs.exists(edits)) { LOG.warn("Null or non-existent edits file: " + edits); continue; } if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) { continue; } long maxSeqId; String fileName = edits.getName(); maxSeqId = Math.abs(Long.parseLong(fileName)); if (maxSeqId <= minSeqIdForTheRegion) { if (LOG.isDebugEnabled()) { String msg = "Maximum sequenceid for this wal is " + maxSeqId + " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion + ", skipped the whole file, path=" + edits; LOG.debug(msg); } continue; } try { // replay the edits. Replay can return -1 if everything is skipped, only update // if seqId is greater seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs)); } catch (IOException e) { handleException(fs, edits, e); } } return seqid; } private void handleException(FileSystem fs, Path edits, IOException e) throws IOException { boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS)); if (conf.get("hbase.skip.errors") != null) { LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use " + HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead."); } if (skipErrors) { Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits); LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed " + edits + " as " + p, e); } else { throw e; } } /** * @param edits File of recovered edits. 
* @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger * than this to be replayed for each store. * @return the sequence id of the last edit added to this region out of the recovered edits log or * minSeqId if nothing added from editlogs. */ private long replayRecoveredEdits(final Path edits, Map maxSeqIdInStores, final CancelableProgressable reporter, FileSystem fs) throws IOException { String msg = "Replaying edits from " + edits; LOG.info(msg); MonitoredTask status = TaskMonitor.get().createStatus(msg); status.setStatus("Opening recovered edits"); WAL.Reader reader = null; try { reader = WALFactory.createReader(fs, edits, conf); long currentEditSeqId = -1; long currentReplaySeqId = -1; long firstSeqIdInLog = -1; long skippedEdits = 0; long editsCount = 0; long intervalEdits = 0; WAL.Entry entry; HStore store = null; boolean reported_once = false; ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager(); try { // How many edits seen before we check elapsed time int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); // How often to send a progress report (default 1/2 master timeout) int period = this.conf.getInt("hbase.hstore.report.period", 300000); long lastReport = EnvironmentEdgeManager.currentTime(); if (coprocessorHost != null) { coprocessorHost.preReplayWALs(this.getRegionInfo(), edits); } while ((entry = reader.next()) != null) { WALKey key = entry.getKey(); WALEdit val = entry.getEdit(); if (ng != null) { // some test, or nonces disabled ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime()); } if (reporter != null) { intervalEdits += val.size(); if (intervalEdits >= interval) { // Number of edits interval reached intervalEdits = 0; long cur = EnvironmentEdgeManager.currentTime(); if (lastReport + period <= cur) { status.setStatus( "Replaying edits..." + " skipped=" + skippedEdits + " edits=" + editsCount); // Timeout reached if (!reporter.progress()) { msg = "Progressable reporter failed, stopping replay for region " + this; LOG.warn(msg); status.abort(msg); throw new IOException(msg); } reported_once = true; lastReport = cur; } } } if (firstSeqIdInLog == -1) { firstSeqIdInLog = key.getSequenceId(); } if (currentEditSeqId > key.getSequenceId()) { // when this condition is true, it means we have a serious defect because we need to // maintain increasing SeqId for WAL edits per region LOG.error(getRegionInfo().getEncodedName() + " : " + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key + "; edit=" + val); } else { currentEditSeqId = key.getSequenceId(); } currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ? key.getOrigLogSeqNum() : currentEditSeqId; // Start coprocessor replay here. The coprocessor is for each WALEdit // instead of a KeyValue. if (coprocessorHost != null) { status.setStatus("Running pre-WAL-restore hook in coprocessors"); if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) { // if bypass this wal entry, ignore it ... continue; } } boolean checkRowWithinBoundary = false; // Check this edit is for this region. if ( !Bytes.equals(key.getEncodedRegionName(), this.getRegionInfo().getEncodedNameAsBytes()) ) { checkRowWithinBoundary = true; } boolean flush = false; MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing(); for (Cell cell : val.getCells()) { // Check this edit is for me. 
Also, guard against writing the special // METACOLUMN info such as HBASE::CACHEFLUSH entries if (WALEdit.isMetaEditFamily(cell)) { // if region names don't match, skipp replaying compaction marker if (!checkRowWithinBoundary) { // this is a special edit, we should handle it CompactionDescriptor compaction = WALEdit.getCompaction(cell); if (compaction != null) { // replay the compaction replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE); } } skippedEdits++; continue; } // Figure which store the edit is meant for. if ( store == null || !CellUtil.matchingFamily(cell, store.getColumnFamilyDescriptor().getName()) ) { store = getStore(cell); } if (store == null) { // This should never happen. Perhaps schema was changed between // crash and redeploy? LOG.warn("No family for cell {} in region {}", cell, this); skippedEdits++; continue; } if ( checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(), cell.getRowArray(), cell.getRowOffset(), cell.getRowLength()) ) { LOG.warn("Row of {} is not within region boundary for region {}", cell, this); skippedEdits++; continue; } // Now, figure if we should skip this edit. if ( key.getSequenceId() <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor().getName()) ) { skippedEdits++; continue; } PrivateCellUtil.setSequenceId(cell, currentReplaySeqId); restoreEdit(store, cell, memStoreSizing); editsCount++; } MemStoreSize mss = memStoreSizing.getMemStoreSize(); incMemStoreSize(mss); flush = isFlushSize(this.memStoreSizing.getMemStoreSize()); if (flush) { internalFlushcache(null, currentEditSeqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } if (coprocessorHost != null) { coprocessorHost.postWALRestore(this.getRegionInfo(), key, val); } } if (coprocessorHost != null) { coprocessorHost.postReplayWALs(this.getRegionInfo(), edits); } } catch (EOFException eof) { Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); msg = "EnLongAddered EOF. Most likely due to Master failure during " + "wal splitting, so we have this data in another edit. Continuing, but renaming " + edits + " as " + p + " for region " + this; LOG.warn(msg, eof); status.abort(msg); } catch (IOException ioe) { // If the IOE resulted from bad file format, // then this problem is idempotent and retrying won't help if (ioe.getCause() instanceof ParseException) { Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); msg = "File corruption enLongAddered! " + "Continuing, but renaming " + edits + " as " + p; LOG.warn(msg, ioe); status.setStatus(msg); } else { status.abort(StringUtils.stringifyException(ioe)); // other IO errors may be transient (bad network connection, // checksum exception on one datanode, etc). throw & retry throw ioe; } } if (reporter != null && !reported_once) { reporter.progress(); } msg = "Applied " + editsCount + ", skipped " + skippedEdits + ", firstSequenceIdInLog=" + firstSeqIdInLog + ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits; status.markComplete(msg); LOG.debug(msg); return currentEditSeqId; } finally { status.cleanup(); if (reader != null) { reader.close(); } } } /** * Call to complete a compaction. Its for the case where we find in the WAL a compaction that was * not finished. We could find one recovering a WAL after a regionserver crash. See HBASE-2331. 
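 * <p>
 * Recovery-time sketch (illustrative; it mirrors the call made while replaying recovered edits,
 * with the surrounding WAL-replay plumbing assumed):
 *
 * <pre>
 * {@code
 * CompactionDescriptor compaction = WALEdit.getCompaction(cell);
 * if (compaction != null) {
 *   replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
 * }
 * }
 * </pre>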
*/ void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles, boolean removeFiles, long replaySeqId) throws IOException { try { checkTargetRegion(compaction.getEncodedRegionName().toByteArray(), "Compaction marker from WAL ", compaction); } catch (WrongRegionException wre) { if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { // skip the compaction marker since it is not for this region return; } throw wre; } synchronized (writestate) { if (replaySeqId < lastReplayedOpenRegionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId); return; } if (replaySeqId < lastReplayedCompactionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId + " is smaller than this regions " + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId); return; } else { lastReplayedCompactionSeqId = replaySeqId; } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying compaction marker " + TextFormat.shortDebugString(compaction) + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); } startRegionOperation(Operation.REPLAY_EVENT); try { HStore store = this.getStore(compaction.getFamilyName().toByteArray()); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Found Compaction WAL edit for deleted family:" + Bytes.toString(compaction.getFamilyName().toByteArray())); return; } store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles); logRegionFiles(); } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files in compaction: " + TextFormat.shortDebugString(compaction) + " doesn't exist any more. Skip loading the file(s)", ex); } finally { closeRegionOperation(Operation.REPLAY_EVENT); } } } void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException { checkTargetRegion(flush.getEncodedRegionName().toByteArray(), "Flush marker from WAL ", flush); if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return; // if primary nothing to do } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying flush marker " + TextFormat.shortDebugString(flush)); } startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close try { FlushAction action = flush.getAction(); switch (action) { case START_FLUSH: replayWALFlushStartMarker(flush); break; case COMMIT_FLUSH: replayWALFlushCommitMarker(flush); break; case ABORT_FLUSH: replayWALFlushAbortMarker(flush); break; case CANNOT_FLUSH: replayWALFlushCannotFlushMarker(flush, replaySeqId); break; default: LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush event with unknown action, ignoring. " + TextFormat.shortDebugString(flush)); break; } logRegionFiles(); } finally { closeRegionOperation(Operation.REPLAY_EVENT); } } /** * Replay the flush marker from primary region by creating a corresponding snapshot of the store * memstores, only if the memstores do not have a higher seqId from an earlier wal edit (because * the events may be coming out of order). 
*/ PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException { long flushSeqId = flush.getFlushSequenceNumber(); HashSet storesToFlush = new HashSet<>(); for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { byte[] family = storeFlush.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush start marker from primary, but the family is not found. Ignoring" + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush)); continue; } storesToFlush.add(store); } MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this); // we will use writestate as a coarse-grain lock for all the replay events // (flush, compaction, region open etc) synchronized (writestate) { try { if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush) + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " + " of " + lastReplayedOpenRegionSeqId); return null; } if (numMutationsWithoutWAL.sum() > 0) { numMutationsWithoutWAL.reset(); dataInMemoryWithoutWAL.reset(); } if (!writestate.flushing) { // we do not have an active snapshot and corresponding this.prepareResult. This means // we can just snapshot our memstores and continue as normal. // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId, storesToFlush, status, false, FlushLifeCycleTracker.DUMMY); if (prepareResult.result == null) { // save the PrepareFlushResult so that we can use it later from commit flush this.writestate.flushing = true; this.prepareFlushResult = prepareResult; status.markComplete("Flush prepare successful"); if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared flush with seqId:" + flush.getFlushSequenceNumber()); } } else { // special case empty memstore. We will still save the flush result in this case, since // our memstore ie empty, but the primary is still flushing if ( prepareResult.getResult().getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY ) { this.writestate.flushing = true; this.prepareFlushResult = prepareResult; if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber()); } } status.abort("Flush prepare failed with " + prepareResult.result); // nothing much to do. prepare flush failed because of some reason. } return prepareResult; } else { // we already have an active snapshot. if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) { // They define the same flush. Log and continue. LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush prepare marker with the same seqId: " + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " + prepareFlushResult.flushOpSeqId + ". Ignoring"); // ignore } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) { // We received a flush with a smaller seqNum than what we have prepared. We can only // ignore this prepare flush request. 
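            // Worked example (illustrative; the numbers are assumptions): with a snapshot already
            // prepared at seqId 120, a late START_FLUSH marker carrying seqId 100 must not roll
            // that snapshot back, so we only log below and keep the seqId-120 snapshot in place.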
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush prepare marker with a smaller seqId: " + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " + prepareFlushResult.flushOpSeqId + ". Ignoring"); // ignore } else { // We received a flush with a larger seqNum than what we have prepared LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush prepare marker with a larger seqId: " + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " + prepareFlushResult.flushOpSeqId + ". Ignoring"); // We do not have multiple active snapshots in the memstore or a way to merge current // memstore snapshot with the contents and resnapshot for now. We cannot take // another snapshot and drop the previous one because that will cause temporary // data loss in the secondary. So we ignore this for now, deferring the resolution // to happen when we see the corresponding flush commit marker. If we have a memstore // snapshot with x, and later received another prepare snapshot with y (where x < y), // when we see flush commit for y, we will drop snapshot for x, and can also drop all // the memstore edits if everything in memstore is < y. This is the usual case for // RS crash + recovery where we might see consequtive prepare flush wal markers. // Otherwise, this will cause more memory to be used in secondary replica until a // further prapare + commit flush is seen and replayed. } } } finally { status.cleanup(); writestate.notifyAll(); } } return null; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", justification = "Intentional; post memstore flush") void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException { MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this); // check whether we have the memstore snapshot with the corresponding seqId. Replay to // secondary region replicas are in order, except for when the region moves or then the // region server crashes. In those cases, we may receive replay requests out of order from // the original seqIds. synchronized (writestate) { try { if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush) + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " + " of " + lastReplayedOpenRegionSeqId); return; } if (writestate.flushing) { PrepareFlushResult prepareFlushResult = this.prepareFlushResult; if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) { if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() + " and a previous prepared snapshot was found"); } // This is the regular case where we received commit flush after prepare flush // corresponding to the same seqId. replayFlushInStores(flush, prepareFlushResult, true); // Set down the memstore size by amount of flush. this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); this.prepareFlushResult = null; writestate.flushing = false; } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) { // This should not happen normally. 
However, lets be safe and guard against these cases // we received a flush commit with a smaller seqId than what we have prepared // we will pick the flush file up from this commit (if we have not seen it), but we // will not drop the memstore LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with smaller seqId: " + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping" + " prepared memstore snapshot"); replayFlushInStores(flush, prepareFlushResult, false); // snapshot is not dropped, so memstore sizes should not be decremented // we still have the prepared snapshot, flushing should still be true } else { // This should not happen normally. However, lets be safe and guard against these cases // we received a flush commit with a larger seqId than what we have prepared // we will pick the flush file for this. We will also obtain the updates lock and // look for contents of the memstore to see whether we have edits after this seqId. // If not, we will drop all the memstore edits and the snapshot as well. LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with larger seqId: " + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " + prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared" + " memstore snapshot"); replayFlushInStores(flush, prepareFlushResult, true); // Set down the memstore size by amount of flush. this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); // Inspect the memstore contents to see whether the memstore contains only edits // with seqId smaller than the flush seqId. If so, we can discard those edits. dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); this.prepareFlushResult = null; writestate.flushing = false; } // If we were waiting for observing a flush or region opening event for not showing // partial data after a secondary region crash, we can allow reads now. We can only make // sure that we are not showing partial data (for example skipping some previous edits) // until we observe a full flush start and flush commit. So if we were not able to find // a previous flush we will not enable reads now. this.setReadsEnabled(true); } else { LOG.warn( getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() + ", but no previous prepared snapshot was found"); // There is no corresponding prepare snapshot from before. // We will pick up the new flushed file replayFlushInStores(flush, null, false); // Inspect the memstore contents to see whether the memstore contains only edits // with seqId smaller than the flush seqId. If so, we can discard those edits. dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); } status.markComplete("Flush commit successful"); // Update the last flushed sequence id for region. this.maxFlushedSeqId = flush.getFlushSequenceNumber(); // advance the mvcc read point so that the new flushed file is visible. mvcc.advanceTo(flush.getFlushSequenceNumber()); } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush) + " doesn't exist any more. Skip loading the file(s)", ex); } finally { status.cleanup(); writestate.notifyAll(); } } // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). 
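    // Waiter-side sketch (illustrative assumption, not code from this class): a thread blocked on
    // memstore pressure would pair with the notifyAll() below roughly as:
    //
    //   synchronized (region) {
    //     while (memstoreStillOverLimit()) {
    //       region.wait(waitMillis);
    //     }
    //   }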
synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } } /** * Replays the given flush descriptor by opening the flush files in stores and dropping the * memstore snapshots if requested. */ private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult, boolean dropMemstoreSnapshot) throws IOException { for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { byte[] family = storeFlush.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker from primary, but the family is not found." + "Ignoring StoreFlushDescriptor:" + storeFlush); continue; } List flushFiles = storeFlush.getFlushOutputList(); StoreFlushContext ctx = null; long startTime = EnvironmentEdgeManager.currentTime(); if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) { ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY); } else { ctx = prepareFlushResult.storeFlushCtxs.get(family); startTime = prepareFlushResult.startTime; } if (ctx == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Unexpected: flush commit marker received from store " + Bytes.toString(family) + " but no associated flush context. Ignoring"); continue; } ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush // Record latest flush time this.lastStoreFlushTimeMap.put(store, startTime); } } private long loadRecoveredHFilesIfAny(Collection stores) throws IOException { Path regionDir = fs.getRegionDir(); long maxSeqId = -1; for (HStore store : stores) { String familyName = store.getColumnFamilyName(); FileStatus[] files = WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName); if (files != null && files.length != 0) { for (FileStatus file : files) { Path filePath = file.getPath(); // If file length is zero then delete it if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) { continue; } try { HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath()); maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID()); } catch (IOException e) { handleException(fs.getFileSystem(), filePath, e); continue; } } if (this.rsServices != null && store.needsCompaction()) { this.rsServices.getCompactionRequestor().requestCompaction(this, store, "load recovered hfiles request compaction", Store.PRIORITY_USER + 1, CompactionLifeCycleTracker.DUMMY, null); } } } return maxSeqId; } /** * Be careful, this method will drop all data in the memstore of this region. Currently, this * method is used to drop memstore to prevent memory leak when replaying recovered.edits while * opening region. 
*/ private MemStoreSize dropMemStoreContents() throws IOException { MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); this.updatesLock.writeLock().lock(); try { for (HStore s : stores.values()) { MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM); LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region " + this.getRegionInfo().getRegionNameAsString() + " , dropped memstoresize: [" + memStoreSize + " }"); totalFreedSize.incMemStoreSize(memStoreSize); } return totalFreedSize.getMemStoreSize(); } finally { this.updatesLock.writeLock().unlock(); } } /** * Drops the memstore contents after replaying a flush descriptor or region open event replay if * the memstore edits have seqNums smaller than the given seq id */ private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException { MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); this.updatesLock.writeLock().lock(); try { long currentSeqId = mvcc.getReadPoint(); if (seqId >= currentSeqId) { // then we can drop the memstore contents since everything is below this seqId LOG.info(getRegionInfo().getEncodedName() + " : " + "Dropping memstore contents as well since replayed flush seqId: " + seqId + " is greater than current seqId:" + currentSeqId); // Prepare flush (take a snapshot) and then abort (drop the snapshot) if (store == null) { for (HStore s : stores.values()) { totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId)); } } else { totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId)); } } else { LOG.info(getRegionInfo().getEncodedName() + " : " + "Not dropping memstore contents since replayed flush seqId: " + seqId + " is smaller than current seqId:" + currentSeqId); } } finally { this.updatesLock.writeLock().unlock(); } return totalFreedSize.getMemStoreSize(); } private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId) throws IOException { MemStoreSize flushableSize = s.getFlushableSize(); this.decrMemStoreSize(flushableSize); StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY); ctx.prepare(); ctx.abort(); return flushableSize; } private void replayWALFlushAbortMarker(FlushDescriptor flush) { // nothing to do for now. A flush abort will cause a RS abort which means that the region // will be opened somewhere else later. We will see the region open event soon, and replaying // that will drop the snapshot } private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) { synchronized (writestate) { if (this.lastReplayedOpenRegionSeqId > replaySeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush) + " because its sequence id " + replaySeqId + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId); return; } // If we were waiting for observing a flush or region opening event for not showing partial // data after a secondary region crash, we can allow reads now. This event means that the // primary was not able to flush because memstore is empty when we requested flush. By the // time we observe this, we are guaranteed to have up to date seqId with our previous // assignment. 
this.setReadsEnabled(true); } } PrepareFlushResult getPrepareFlushResult() { return prepareFlushResult; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", justification = "Intentional; cleared the memstore") void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException { checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(), "RegionEvent marker from WAL ", regionEvent); startRegionOperation(Operation.REPLAY_EVENT); try { if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return; // if primary nothing to do } if (regionEvent.getEventType() == EventType.REGION_CLOSE) { // nothing to do on REGION_CLOSE for now. return; } if (regionEvent.getEventType() != EventType.REGION_OPEN) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Unknown region event received, ignoring :" + TextFormat.shortDebugString(regionEvent)); return; } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent)); } // we will use writestate as a coarse-grain lock for all the replay events synchronized (writestate) { // Replication can deliver events out of order when primary region moves or the region // server crashes, since there is no coordination between replication of different wal files // belonging to different region servers. We have to safe guard against this case by using // region open event's seqid. Since this is the first event that the region puts (after // possibly flushing recovered.edits), after seeing this event, we can ignore every edit // smaller than this seqId if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) { this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber(); } else { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent) + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " + " of " + lastReplayedOpenRegionSeqId); return; } // region open lists all the files that the region has at the time of the opening. Just pick // all the files and drop prepared flushes and empty memstores for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) { // stores of primary may be different now byte[] family = storeDescriptor.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a region open marker from primary, but the family is not found. " + "Ignoring. StoreDescriptor:" + storeDescriptor); continue; } long storeSeqId = store.getMaxSequenceId().orElse(0L); List storeFiles = storeDescriptor.getStoreFileList(); try { store.refreshStoreFiles(storeFiles); // replace the files with the new ones } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files: " + storeFiles + " doesn't exist any more. Skip loading the file(s)", ex); continue; } if (store.getMaxSequenceId().orElse(0L) != storeSeqId) { // Record latest flush time if we picked up new files lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime()); } if (writestate.flushing) { // only drop memstore snapshots if they are smaller than last flush for the store if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) { StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ? 
null : this.prepareFlushResult.storeFlushCtxs.get(family); if (ctx != null) { MemStoreSize mss = store.getFlushableSize(); ctx.abort(); this.decrMemStoreSize(mss); this.prepareFlushResult.storeFlushCtxs.remove(family); } } } // Drop the memstore contents if they are now smaller than the latest seen flushed file dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store); if (storeSeqId > this.maxFlushedSeqId) { this.maxFlushedSeqId = storeSeqId; } } // if all stores ended up dropping their snapshots, we can safely drop the // prepareFlushResult dropPrepareFlushIfPossible(); // advance the mvcc read point so that the new flushed file is visible. mvcc.await(); // If we were waiting for observing a flush or region opening event for not showing partial // data after a secondary region crash, we can allow reads now. this.setReadsEnabled(true); // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } } logRegionFiles(); } finally { closeRegionOperation(Operation.REPLAY_EVENT); } } void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException { checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(), "BulkLoad marker from WAL ", bulkLoadEvent); if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return; // if primary nothing to do } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent)); } // check if multiple families involved boolean multipleFamilies = false; byte[] family = null; for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { byte[] fam = storeDescriptor.getFamilyName().toByteArray(); if (family == null) { family = fam; } else if (!Bytes.equals(family, fam)) { multipleFamilies = true; break; } } startBulkRegionOperation(multipleFamilies); try { // we will use writestate as a coarse-grain lock for all the replay events synchronized (writestate) { // Replication can deliver events out of order when primary region moves or the region // server crashes, since there is no coordination between replication of different wal files // belonging to different region servers. We have to safe guard against this case by using // region open event's seqid. Since this is the first event that the region puts (after // possibly flushing recovered.edits), after seeing this event, we can ignore every edit // smaller than this seqId if ( bulkLoadEvent.getBulkloadSeqNum() >= 0 && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum() ) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying bulkload event :" + TextFormat.shortDebugString(bulkLoadEvent) + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId" + " =" + lastReplayedOpenRegionSeqId); return; } for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { // stores of primary may be different now family = storeDescriptor.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a bulk load marker from primary, but the family is not found. " + "Ignoring. 
StoreDescriptor:" + storeDescriptor); continue; } List storeFiles = storeDescriptor.getStoreFileList(); for (String storeFile : storeFiles) { StoreFileInfo storeFileInfo = null; try { storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile); store.bulkLoadHFile(storeFileInfo); } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + ((storeFileInfo != null) ? storeFileInfo.toString() : (new Path(Bytes.toString(family), storeFile)).toString()) + " doesn't exist any more. Skip loading the file"); } } } } if (bulkLoadEvent.getBulkloadSeqNum() > 0) { mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum()); } } finally { closeBulkRegionOperation(); } } /** * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult */ private void dropPrepareFlushIfPossible() { if (writestate.flushing) { boolean canDrop = true; if (prepareFlushResult.storeFlushCtxs != null) { for (Entry entry : prepareFlushResult.storeFlushCtxs .entrySet()) { HStore store = getStore(entry.getKey()); if (store == null) { continue; } if (store.getSnapshotSize().getDataSize() > 0) { canDrop = false; break; } } } // this means that all the stores in the region has finished flushing, but the WAL marker // may not have been written or we did not receive it yet. if (canDrop) { writestate.flushing = false; this.prepareFlushResult = null; } } } @Override public boolean refreshStoreFiles() throws IOException { return refreshStoreFiles(false); } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", justification = "Notify is about post replay. Intentional") protected boolean refreshStoreFiles(boolean force) throws IOException { if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return false; // if primary nothing to do } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Refreshing store files to see whether we can free up memstore"); } long totalFreedDataSize = 0; long smallestSeqIdInStores = Long.MAX_VALUE; startRegionOperation(); // obtain region close lock try { Map map = new HashMap<>(); synchronized (writestate) { for (HStore store : stores.values()) { // TODO: some stores might see new data from flush, while others do not which // MIGHT break atomic edits across column families. long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L); // refresh the store files. This is similar to observing a region open wal marker. store.refreshStoreFiles(); long storeSeqId = store.getMaxSequenceId().orElse(0L); if (storeSeqId < smallestSeqIdInStores) { smallestSeqIdInStores = storeSeqId; } // see whether we can drop the memstore or the snapshot if (storeSeqId > maxSeqIdBefore) { if (writestate.flushing) { // only drop memstore snapshots if they are smaller than last flush for the store if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) { StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ? null : this.prepareFlushResult.storeFlushCtxs .get(store.getColumnFamilyDescriptor().getName()); if (ctx != null) { MemStoreSize mss = store.getFlushableSize(); ctx.abort(); this.decrMemStoreSize(mss); this.prepareFlushResult.storeFlushCtxs .remove(store.getColumnFamilyDescriptor().getName()); totalFreedDataSize += mss.getDataSize(); } } } map.put(store, storeSeqId); } } // if all stores ended up dropping their snapshots, we can safely drop the // prepareFlushResult dropPrepareFlushIfPossible(); // advance the mvcc read point so that the new flushed files are visible. 
// either greater than flush seq number or they were already picked up via flush. for (HStore s : stores.values()) { mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L)); } // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely // skip all edits that are to be replayed in the future with that has a smaller seqId // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits // that we have picked the flush files for if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) { this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores; } } if (!map.isEmpty()) { for (Map.Entry entry : map.entrySet()) { // Drop the memstore contents if they are now smaller than the latest seen flushed file totalFreedDataSize += dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey()).getDataSize(); } } // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } return totalFreedDataSize > 0; } finally { closeRegionOperation(); } } private void logRegionFiles() { if (LOG.isTraceEnabled()) { LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: "); stores.values().stream().filter(s -> s.getStorefiles() != null) .flatMap(s -> s.getStorefiles().stream()) .forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf)); } } /** * Checks whether the given regionName is either equal to our region, or that the regionName is * the primary region to our corresponding range for the secondary replica. */ private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload) throws WrongRegionException { if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) { return; } if ( !RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) && Bytes.equals(encodedRegionName, this.fs.getRegionInfoForFS().getEncodedNameAsBytes()) ) { return; } throw new WrongRegionException( exceptionMsg + payload + " targetted for region " + Bytes.toStringBinary(encodedRegionName) + " does not match this region: " + this.getRegionInfo()); } /** * Used by tests * @param s Store to add edit too. * @param cell Cell to add. */ protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) { s.add(cell, memstoreAccounting); } /** * make sure have been through lease recovery before get file status, so the file length can be * trusted. * @param p File to check. * @return True if file was zero-length (and if so, we'll delete it in here). */ private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat, final Path p) throws IOException { if (stat.getLen() > 0) { return false; } LOG.warn("File " + p + " is zero-length, deleting."); fs.delete(p, false); return true; } protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup) throws IOException { if (family.isMobEnabled()) { if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) { throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY + " accordingly."); } return new HMobStore(this, family, this.conf, warmup); } return new HStore(this, family, this.conf, warmup); } @Override public HStore getStore(byte[] column) { return this.stores.get(column); } /** * Return HStore instance. Does not do any copy: as the number of store is limited, we iterate on * the list. 
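 * <p>
 * Minimal internal usage sketch (illustrative; it mirrors how recovered edits are restored, with
 * the surrounding memstore accounting assumed):
 *
 * <pre>
 * {@code
 * HStore store = getStore(cell);
 * if (store != null) {
 *   restoreEdit(store, cell, memStoreSizing);
 * }
 * }
 * </pre>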
*/ private HStore getStore(Cell cell) { return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey())) .map(e -> e.getValue()).findFirst().orElse(null); } @Override public List getStores() { return new ArrayList<>(stores.values()); } @Override public List getStoreFileList(byte[][] columns) throws IllegalArgumentException { List storeFileNames = new ArrayList<>(); synchronized (closeLock) { for (byte[] column : columns) { HStore store = this.stores.get(column); if (store == null) { throw new IllegalArgumentException( "No column family : " + new String(column, StandardCharsets.UTF_8) + " available"); } Collection storeFiles = store.getStorefiles(); if (storeFiles == null) { continue; } for (HStoreFile storeFile : storeFiles) { storeFileNames.add(storeFile.getPath().toString()); } logRegionFiles(); } } return storeFileNames; } ////////////////////////////////////////////////////////////////////////////// // Support code ////////////////////////////////////////////////////////////////////////////// /** Make sure this is a valid row for the HRegion */ void checkRow(byte[] row, String op) throws IOException { if (!rowIsInRange(getRegionInfo(), row)) { throw new WrongRegionException("Requested row out of range for " + op + " on HRegion " + this + ", startKey='" + Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" + Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" + Bytes.toStringBinary(row) + "'"); } } /** * Get an exclusive ( write lock ) lock on a given row. * @param row Which row to lock. * @return A locked RowLock. The lock is exclusive and already aqquired. */ public RowLock getRowLock(byte[] row) throws IOException { return getRowLock(row, false); } @Override public RowLock getRowLock(byte[] row, boolean readLock) throws IOException { checkRow(row, "row lock"); return getRowLock(row, readLock, null); } Span createRegionSpan(String name) { return TraceUtil.createSpan(name).setAttribute(REGION_NAMES_KEY, Collections.singletonList(getRegionInfo().getRegionNameAsString())); } // will be override in tests protected RowLock getRowLockInternal(byte[] row, boolean readLock, RowLock prevRowLock) throws IOException { // create an object to use a a key in the row lock map HashedBytes rowKey = new HashedBytes(row); RowLockContext rowLockContext = null; RowLockImpl result = null; boolean success = false; try { // Keep trying until we have a lock or error out. // TODO: do we need to add a time component here? while (result == null) { rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey)); // Now try an get the lock. // This can fail as if (readLock) { // For read lock, if the caller has locked the same row previously, it will not try // to acquire the same read lock. It simply returns the previous row lock. 
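          // Caller-side sketch (illustrative; not taken from this method): acquire through the
          // public API and always release in a finally block so the lock context gets cleaned up.
          //
          //   RowLock rowLock = region.getRowLock(row, true); // shared (read) lock
          //   try {
          //     // read the row while holding the lock
          //   } finally {
          //     rowLock.release();
          //   }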
RowLockImpl prevRowLockImpl = (RowLockImpl) prevRowLock; if ( (prevRowLockImpl != null) && (prevRowLockImpl.getLock() == rowLockContext.readWriteLock.readLock()) ) { success = true; return prevRowLock; } result = rowLockContext.newReadLock(); } else { result = rowLockContext.newWriteLock(); } } int timeout = rowLockWaitDuration; boolean reachDeadlineFirst = false; Optional call = RpcServer.getCurrentCall(); if (call.isPresent()) { long deadline = call.get().getDeadline(); if (deadline < Long.MAX_VALUE) { int timeToDeadline = (int) (deadline - EnvironmentEdgeManager.currentTime()); if (timeToDeadline <= this.rowLockWaitDuration) { reachDeadlineFirst = true; timeout = timeToDeadline; } } } if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) { String message = "Timed out waiting for lock for row: " + rowKey + " in region " + getRegionInfo().getEncodedName(); if (reachDeadlineFirst) { throw new TimeoutIOException(message); } else { // If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request. throw new IOException(message); } } rowLockContext.setThreadName(Thread.currentThread().getName()); success = true; return result; } catch (InterruptedException ie) { if (LOG.isDebugEnabled()) { LOG.debug("Thread interrupted waiting for lock on row: {}, in region {}", rowKey, getRegionInfo().getRegionNameAsString()); } throw throwOnInterrupt(ie); } catch (Error error) { // The maximum lock count for read lock is 64K (hardcoded), when this maximum count // is reached, it will throw out an Error. This Error needs to be caught so it can // go ahead to process the minibatch with lock acquired. LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row), getRegionInfo().getRegionNameAsString(), error); IOException ioe = new IOException(error); throw ioe; } finally { // Clean up the counts just in case this was the thing keeping the context alive. 
if (!success && rowLockContext != null) { rowLockContext.cleanUp(); } } } private RowLock getRowLock(byte[] row, boolean readLock, final RowLock prevRowLock) throws IOException { return TraceUtil.trace(() -> getRowLockInternal(row, readLock, prevRowLock), () -> createRegionSpan("Region.getRowLock").setAttribute(ROW_LOCK_READ_LOCK_KEY, readLock)); } private void releaseRowLocks(List rowLocks) { if (rowLocks != null) { for (RowLock rowLock : rowLocks) { rowLock.release(); } rowLocks.clear(); } } public int getReadLockCount() { return lock.getReadLockCount(); } public ConcurrentHashMap getLockedRows() { return lockedRows; } class RowLockContext { private final HashedBytes row; final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true); final AtomicBoolean usable = new AtomicBoolean(true); final AtomicInteger count = new AtomicInteger(0); final Object lock = new Object(); private String threadName; RowLockContext(HashedBytes row) { this.row = row; } RowLockImpl newWriteLock() { Lock l = readWriteLock.writeLock(); return getRowLock(l); } RowLockImpl newReadLock() { Lock l = readWriteLock.readLock(); return getRowLock(l); } private RowLockImpl getRowLock(Lock l) { count.incrementAndGet(); synchronized (lock) { if (usable.get()) { return new RowLockImpl(this, l); } else { return null; } } } void cleanUp() { long c = count.decrementAndGet(); if (c <= 0) { synchronized (lock) { if (count.get() <= 0 && usable.get()) { // Don't attempt to remove row if already removed usable.set(false); RowLockContext removed = lockedRows.remove(row); assert removed == this : "we should never remove a different context"; } } } } public void setThreadName(String threadName) { this.threadName = threadName; } @Override public String toString() { return "RowLockContext{" + "row=" + row + ", readWriteLock=" + readWriteLock + ", count=" + count + ", threadName=" + threadName + '}'; } } /** * Class used to represent a lock on a row. */ public static class RowLockImpl implements RowLock { private final RowLockContext context; private final Lock lock; public RowLockImpl(RowLockContext context, Lock lock) { this.context = context; this.lock = lock; } public Lock getLock() { return lock; } public RowLockContext getContext() { return context; } @Override public void release() { lock.unlock(); context.cleanUp(); } @Override public String toString() { return "RowLockImpl{" + "context=" + context + ", lock=" + lock + '}'; } } /** * Determines whether multiple column families are present Precondition: familyPaths is not null * @param familyPaths List of (column family, hfilePath) */ private static boolean hasMultipleColumnFamilies(Collection> familyPaths) { boolean multipleFamilies = false; byte[] family = null; for (Pair pair : familyPaths) { byte[] fam = pair.getFirst(); if (family == null) { family = fam; } else if (!Bytes.equals(family, fam)) { multipleFamilies = true; break; } } return multipleFamilies; } /** * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple * column families atomically. * @param familyPaths List of Pair<byte[] column family, String hfilePath> * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be * bulk loaded * @return Map from family to List of store file paths if successful, null if failed recoverably * @throws IOException if failed unrecoverably. 
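   * <p>
   * For illustration only (not from the original documentation): a server-side caller that
   * already holds an open {@code HRegion} might drive this API roughly as below. The staging
   * path and the {@code region} variable are assumptions for the sketch; {@code Bytes} and
   * {@code Pair} are the usual HBase utility classes.
   * <pre>{@code
   * List<Pair<byte[], String>> familyPaths = new ArrayList<>();
   * familyPaths.add(new Pair<>(Bytes.toBytes("cf"), "/staging/cf/hfile-0001")); // hypothetical path
   * // assignSeqId=true may trigger a flush so the loaded files get a sequence id beyond the
   * // current memstore contents (see HBASE-10958 in the method body below).
   * Map<byte[], List<Path>> loaded = region.bulkLoadHFiles(familyPaths, true, null);
   * if (loaded == null) {
   *   // recoverable failure, e.g. the region split after the paths were computed
   * }
   * }</pre>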
*/ public Map> bulkLoadHFiles(Collection> familyPaths, boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException { return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false, null, true); } /** * Listener class to enable callers of bulkLoadHFile() to perform any necessary pre/post * processing of a given bulkload call */ public interface BulkLoadListener { /** * Called before an HFile is actually loaded * @param family family being loaded to * @param srcPath path of HFile * @return final path to be used for actual loading */ String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile, String customStaging) throws IOException; /** * Called after a successful HFile load * @param family family being loaded to * @param srcPath path of HFile */ void doneBulkLoad(byte[] family, String srcPath) throws IOException; /** * Called after a failed HFile load * @param family family being loaded to * @param srcPath path of HFile */ void failedBulkLoad(byte[] family, String srcPath) throws IOException; } /** * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple * column families atomically. * @param familyPaths List of Pair<byte[] column family, String hfilePath> * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be * bulk loaded * @param copyFile always copy hfiles if true * @param clusterIds ids from clusters that had already handled the given bulkload event. * @return Map from family to List of store file paths if successful, null if failed recoverably * @throws IOException if failed unrecoverably. */ public Map> bulkLoadHFiles(Collection> familyPaths, boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile, List clusterIds, boolean replicate) throws IOException { long seqId = -1; Map> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); Map storeFilesSizes = new HashMap<>(); Preconditions.checkNotNull(familyPaths); // we need writeLock for multi-family bulk load startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths)); boolean isSuccessful = false; try { this.writeRequestsCount.increment(); // There possibly was a split that happened between when the split keys // were gathered and before the HRegion's write lock was taken. We need // to validate the HFile region before attempting to bulk load all of them IOException ioException = null; List> failures = new ArrayList<>(); for (Pair p : familyPaths) { byte[] familyName = p.getFirst(); String path = p.getSecond(); HStore store = getStore(familyName); if (store == null) { ioException = new org.apache.hadoop.hbase.DoNotRetryIOException( "No such column family " + Bytes.toStringBinary(familyName)); } else { try { store.assertBulkLoadHFileOk(new Path(path)); } catch (WrongRegionException wre) { // recoverable (file doesn't fit in region) failures.add(p); } catch (IOException ioe) { // unrecoverable (hdfs problem) ioException = ioe; } } // validation failed because of some sort of IO problem. if (ioException != null) { LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this, ioException); throw ioException; } } // validation failed, bail out before doing anything permanent. if (failures.size() != 0) { StringBuilder list = new StringBuilder(); for (Pair p : failures) { list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ") .append(p.getSecond()); } // problem when validating LOG.warn("There was a recoverable bulk load failure likely due to a split. 
These (family," + " HFile) pairs were not loaded: {}, in region {}", list.toString(), this); return null; } // We need to assign a sequential ID that's in between two memstores in order to preserve // the guarantee that all the edits lower than the highest sequential ID from all the // HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is // a sequence id that we can be sure is beyond the last hfile written). if (assignSeqId) { FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY); if (fs.isFlushSucceeded()) { seqId = ((FlushResultImpl) fs).flushSequenceId; } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) { seqId = ((FlushResultImpl) fs).flushSequenceId; } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) { // CANNOT_FLUSH may mean that a flush is already on-going // we need to wait for that flush to complete waitForFlushes(); } else { throw new IOException("Could not bulk load with an assigned sequential ID because the " + "flush didn't run. Reason for not flushing: " + ((FlushResultImpl) fs).failureReason); } } Map>> familyWithFinalPath = new TreeMap<>(Bytes.BYTES_COMPARATOR); for (Pair p : familyPaths) { byte[] familyName = p.getFirst(); String path = p.getSecond(); HStore store = getStore(familyName); if (!familyWithFinalPath.containsKey(familyName)) { familyWithFinalPath.put(familyName, new ArrayList<>()); } List> lst = familyWithFinalPath.get(familyName); String finalPath = path; try { boolean reqTmp = store.storeEngine.requireWritingToTmpDirFirst(); if (bulkLoadListener != null) { finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile, reqTmp ? null : fs.getRegionDir().toString()); } Pair pair = null; if (reqTmp || !StoreFileInfo.isHFile(finalPath)) { pair = store.preBulkLoadHFile(finalPath, seqId); } else { Path livePath = new Path(finalPath); pair = new Pair<>(livePath, livePath); } lst.add(pair); } catch (IOException ioe) { // A failure here can cause an atomicity violation that we currently // cannot recover from since it is likely a failed HDFS operation. 
LOG.error("There was a partial failure due to IO when attempting to" + " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe); if (bulkLoadListener != null) { try { bulkLoadListener.failedBulkLoad(familyName, finalPath); } catch (Exception ex) { LOG.error("Error while calling failedBulkLoad for family " + Bytes.toString(familyName) + " with path " + path, ex); } } throw ioe; } } if (this.getCoprocessorHost() != null) { for (Map.Entry>> entry : familyWithFinalPath.entrySet()) { this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue()); } } for (Map.Entry>> entry : familyWithFinalPath.entrySet()) { byte[] familyName = entry.getKey(); for (Pair p : entry.getValue()) { String path = p.getFirst().toString(); Path commitedStoreFile = p.getSecond(); HStore store = getStore(familyName); try { store.bulkLoadHFile(familyName, path, commitedStoreFile); // Note the size of the store file try { FileSystem fs = commitedStoreFile.getFileSystem(baseConf); storeFilesSizes.put(commitedStoreFile.getName(), fs.getFileStatus(commitedStoreFile).getLen()); } catch (IOException e) { LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e); storeFilesSizes.put(commitedStoreFile.getName(), 0L); } if (storeFiles.containsKey(familyName)) { storeFiles.get(familyName).add(commitedStoreFile); } else { List storeFileNames = new ArrayList<>(); storeFileNames.add(commitedStoreFile); storeFiles.put(familyName, storeFileNames); } if (bulkLoadListener != null) { bulkLoadListener.doneBulkLoad(familyName, path); } } catch (IOException ioe) { // A failure here can cause an atomicity violation that we currently // cannot recover from since it is likely a failed HDFS operation. // TODO Need a better story for reverting partial failures due to HDFS. 
LOG.error("There was a partial failure due to IO when attempting to" + " load " + Bytes.toString(familyName) + " : " + p.getSecond(), ioe); if (bulkLoadListener != null) { try { bulkLoadListener.failedBulkLoad(familyName, path); } catch (Exception ex) { LOG.error("Error while calling failedBulkLoad for family " + Bytes.toString(familyName) + " with path " + path, ex); } } throw ioe; } } } isSuccessful = true; if (conf.getBoolean(COMPACTION_AFTER_BULKLOAD_ENABLE, true)) { // request compaction familyWithFinalPath.keySet().forEach(family -> { HStore store = getStore(family); try { if (this.rsServices != null && store.needsCompaction()) { this.rsServices.getCompactionRequestor().requestSystemCompaction(this, store, "bulkload hfiles request compaction", true); LOG.info("Request compaction for region {} family {} after bulk load", this.getRegionInfo().getEncodedName(), store.getColumnFamilyName()); } } catch (IOException e) { LOG.error("bulkload hfiles request compaction error ", e); } }); } } finally { if (wal != null && !storeFiles.isEmpty()) { // Write a bulk load event for hfiles that are loaded try { WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(), UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles, storeFilesSizes, seqId, clusterIds, replicate); WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(), loadDescriptor, mvcc); } catch (IOException ioe) { if (this.rsServices != null) { // Have to abort region server because some hfiles has been loaded but we can't write // the event into WAL isSuccessful = false; this.rsServices.abort("Failed to write bulk load event into WAL.", ioe); } } } closeBulkRegionOperation(); } return isSuccessful ? storeFiles : null; } @Override public boolean equals(Object o) { return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(), ((HRegion) o).getRegionInfo().getRegionName()); } @Override public int hashCode() { return Bytes.hashCode(getRegionInfo().getRegionName()); } @Override public String toString() { return getRegionInfo().getRegionNameAsString(); } // Utility methods /** * A utility method to create new instances of HRegion based on the {@link HConstants#REGION_IMPL} * configuration property. * @param tableDir qualified path of directory where region should be located, usually the table * directory. * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a * logfile from the previous execution that's custom-computed for this HRegion. * The HRegionServer computes and sorts the appropriate wal info for this * HRegion. If there is a previous file (implying that the HRegion has been * written-to before), then read it from the supplied path. * @param fs is the filesystem. * @param conf is global configuration settings. * @param regionInfo - RegionInfo that describes the region is new), then read them from the * supplied path. 
* @param htd the table descriptor * @return the new instance */ public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration conf, RegionInfo regionInfo, final TableDescriptor htd, RegionServerServices rsServices) { try { @SuppressWarnings("unchecked") Class regionClass = (Class) conf.getClass(HConstants.REGION_IMPL, HRegion.class); Constructor c = regionClass.getConstructor(Path.class, WAL.class, FileSystem.class, Configuration.class, RegionInfo.class, TableDescriptor.class, RegionServerServices.class); return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices); } catch (Throwable e) { // todo: what should I throw here? throw new IllegalStateException("Could not instantiate a region instance.", e); } } /** * Convenience method creating new HRegions. Used by createTable. * @param info Info for region to create. * @param rootDir Root directory for HBase instance * @param wal shared WAL * @param initialize - true to initialize the region * @return new HRegion */ public static HRegion createHRegion(final RegionInfo info, final Path rootDir, final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, final boolean initialize) throws IOException { return createHRegion(info, rootDir, conf, hTableDescriptor, wal, initialize, null); } /** * Convenience method creating new HRegions. Used by createTable. * @param info Info for region to create. * @param rootDir Root directory for HBase instance * @param wal shared WAL * @param initialize - true to initialize the region * @param rsRpcServices An interface we can request flushes against. * @return new HRegion */ public static HRegion createHRegion(final RegionInfo info, final Path rootDir, final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, final boolean initialize, RegionServerServices rsRpcServices) throws IOException { LOG.info("creating " + info + ", tableDescriptor=" + (hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir); createRegionDir(conf, info, rootDir); FileSystem fs = rootDir.getFileSystem(conf); Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, rsRpcServices); if (initialize) { region.initialize(null); } return region; } /** * Create a region under the given table directory. */ public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs, Path tableDir, TableDescriptor tableDesc) throws IOException { LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc, tableDir); HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo); HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null); return region; } /** * Create the region directory in the filesystem. */ public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri, Path rootDir) throws IOException { FileSystem fs = rootDir.getFileSystem(configuration); Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable()); // If directory already exists, will log warning and keep going. Will try to create // .regioninfo. If one exists, will overwrite. 
return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri); } public static HRegion createHRegion(final RegionInfo info, final Path rootDir, final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal) throws IOException { return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true); } /** * Open a Region. * @param info Info for region to be opened. * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is * properly kept up. HRegionStore does this every time it opens a new region. * @return new HRegion */ public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf) throws IOException { return openHRegion(info, htd, wal, conf, null, null); } /** * Open a Region. * @param info Info for region to be opened * @param htd the table descriptor * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) * passing the result of the call to HRegion#getMinSequenceId() to ensure the * wal id is properly kept up. HRegionStore does this every time it opens a new * region. * @param conf The Configuration object to use. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter); } /** * Open a Region. * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is * properly kept up. HRegionStore does this every time it opens a new region. * @param conf The Configuration object to use. * @return new HRegion */ public static HRegion openHRegion(Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf) throws IOException { return openHRegion(rootDir, info, htd, wal, conf, null, null); } /** * Open a Region. * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) * passing the result of the call to HRegion#getMinSequenceId() to ensure the * wal id is properly kept up. HRegionStore does this every time it opens a new * region. * @param conf The Configuration object to use. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegion(final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { FileSystem fs = null; if (rsServices != null) { fs = rsServices.getFileSystem(); } if (fs == null) { fs = rootDir.getFileSystem(conf); } return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter); } /** * Open a Region. 
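   * <p>
   * Illustrative sketch only, not part of the original documentation: assuming a
   * {@code Configuration}, a {@code FileSystem}, the HBase root directory, a {@code RegionInfo},
   * the matching {@code TableDescriptor} and a {@code WAL} are already in hand, a caller could
   * open and release the region like this:
   * <pre>{@code
   * HRegion region = HRegion.openHRegion(conf, fs, rootDir, regionInfo, tableDescriptor, wal);
   * try {
   *   Result r = region.get(new Get(Bytes.toBytes("some-row"))); // hypothetical row key
   * } finally {
   *   region.close();
   * }
   * }</pre>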
* @param conf The Configuration object to use. * @param fs Filesystem to use * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is * properly kept up. HRegionStore does this every time it opens a new region. * @return new HRegion */ public static HRegion openHRegion(final Configuration conf, final FileSystem fs, final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal) throws IOException { return openHRegion(conf, fs, rootDir, info, htd, wal, null, null); } /** * Open a Region. * @param conf The Configuration object to use. * @param fs Filesystem to use * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) * passing the result of the call to HRegion#getMinSequenceId() to ensure the * wal id is properly kept up. HRegionStore does this every time it opens a new * region. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegion(final Configuration conf, final FileSystem fs, final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter); } /** * Open a Region. * @param conf The Configuration object to use. * @param fs Filesystem to use * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) * passing the result of the call to HRegion#getMinSequenceId() to ensure the * wal id is properly kept up. HRegionStore does this every time it opens a new * region. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs, final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { Objects.requireNonNull(info, "RegionInfo cannot be null"); LOG.debug("Opening region: {}", info); HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices); return r.openHRegion(reporter); } public NavigableMap getReplicationScope() { return this.replicationScope; } /** * Useful when reopening a closed region (normally for unit tests) * @param other original object * @param reporter An interface we can report progress against. 
* @return new HRegion */ public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter) throws IOException { HRegionFileSystem regionFs = other.getRegionFileSystem(); HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(), other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null); return r.openHRegion(reporter); } public static Region openHRegion(final Region other, final CancelableProgressable reporter) throws IOException { return openHRegion((HRegion) other, reporter); } /** * Open HRegion. *
* Calls initialize and sets sequenceId. * @return Returns this */ private HRegion openHRegion(final CancelableProgressable reporter) throws IOException { try { CompoundConfiguration cConfig = new CompoundConfiguration().add(conf).addBytesMap(htableDescriptor.getValues()); // Refuse to open the region if we are missing local compression support TableDescriptorChecker.checkCompression(cConfig, htableDescriptor); // Refuse to open the region if encryption configuration is incorrect or // codec support is missing LOG.debug("checking encryption for " + this.getRegionInfo().getEncodedName()); TableDescriptorChecker.checkEncryption(cConfig, htableDescriptor); // Refuse to open the region if a required class cannot be loaded LOG.debug("checking classloading for " + this.getRegionInfo().getEncodedName()); TableDescriptorChecker.checkClassLoading(cConfig, htableDescriptor); this.openSeqNum = initialize(reporter); this.mvcc.advanceTo(openSeqNum); // The openSeqNum must be increased every time when a region is assigned, as we rely on it to // determine whether a region has been successfully reopened. So here we always write open // marker, even if the table is read only. if ( wal != null && getRegionServerServices() != null && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) ) { writeRegionOpenMarker(wal, openSeqNum); } } catch (Throwable t) { // By coprocessor path wrong region will open failed, // MetricsRegionWrapperImpl is already init and not close, // add region close when open failed try { // It is not required to write sequence id file when region open is failed. // Passing true to skip the sequence id file write. this.close(true); } catch (Throwable e) { LOG.warn("Open region: {} failed. Try close region but got exception ", this.getRegionInfo(), e); } throw t; } return this; } /** * Open a Region on a read-only file-system (like hdfs snapshots) * @param conf The Configuration object to use. * @param fs Filesystem to use * @param info Info for region to be opened. * @param htd the table descriptor * @return new HRegion */ public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs, final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException { if (info == null) { throw new NullPointerException("Passed region info is null"); } if (LOG.isDebugEnabled()) { LOG.debug("Opening region (readOnly filesystem): " + info); } if (info.getReplicaId() <= 0) { info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build(); } HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null); r.writestate.setReadOnly(true); return r.openHRegion(null); } public static HRegion warmupHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { Objects.requireNonNull(info, "RegionInfo cannot be null"); LOG.debug("Warmup {}", info); Path rootDir = CommonFSUtils.getRootDir(conf); Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); FileSystem fs = null; if (rsServices != null) { fs = rsServices.getFileSystem(); } if (fs == null) { fs = rootDir.getFileSystem(conf); } HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null); r.initializeWarmup(reporter); r.close(); return r; } /** * Computes the Path of the HRegion * @param tabledir qualified path for table * @param name ENCODED region name * @return Path of HRegion directory * @deprecated For tests only; to be removed. 
*/ @Deprecated public static Path getRegionDir(final Path tabledir, final String name) { return new Path(tabledir, name); } /** * Determines if the specified row is within the row range specified by the specified RegionInfo * @param info RegionInfo that specifies the row range * @param row row to be checked * @return true if the row is within the range specified by the RegionInfo */ public static boolean rowIsInRange(RegionInfo info, final byte[] row) { return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), row) <= 0)) && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), row) > 0)); } public static boolean rowIsInRange(RegionInfo info, final byte[] row, final int offset, final short length) { return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length, row, offset, length) <= 0)) && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0)); } @Override public Result get(final Get get) throws IOException { prepareGet(get); List results = get(get, true); boolean stale = this.getRegionInfo().getReplicaId() != 0; return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale); } void prepareGet(final Get get) throws IOException { checkRow(get.getRow(), "Get"); // Verify families are all valid if (get.hasFamilies()) { for (byte[] family : get.familySet()) { checkFamily(family); } } else { // Adding all families to scanner for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { get.addFamily(family); } } } @Override public List get(Get get, boolean withCoprocessor) throws IOException { return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE); } private List get(Get get, boolean withCoprocessor, long nonceGroup, long nonce) throws IOException { return TraceUtil.trace(() -> getInternal(get, withCoprocessor, nonceGroup, nonce), () -> createRegionSpan("Region.get")); } private List getInternal(Get get, boolean withCoprocessor, long nonceGroup, long nonce) throws IOException { List results = new ArrayList<>(); long before = EnvironmentEdgeManager.currentTime(); // pre-get CP hook if (withCoprocessor && (coprocessorHost != null)) { if (coprocessorHost.preGet(get, results)) { metricsUpdateForGet(results, before); return results; } } Scan scan = new Scan(get); if (scan.getLoadColumnFamiliesOnDemandValue() == null) { scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault()); } try (RegionScanner scanner = getScanner(scan, null, nonceGroup, nonce)) { List tmp = new ArrayList<>(); scanner.next(tmp); // Copy EC to heap, then close the scanner. // This can be an EXPENSIVE call. It may make an extra copy from offheap to onheap buffers. // See more details in HBASE-26036. 
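      // Note (assumption about CellUtil semantics, added for clarity): cloneIfNecessary is
      // expected to copy a cell onto the heap only when it is backed by a buffer that may be
      // recycled once the scanner below is closed; regular on-heap cells are returned as-is.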
for (Cell cell : tmp) { results.add(CellUtil.cloneIfNecessary(cell)); } } // post-get CP hook if (withCoprocessor && (coprocessorHost != null)) { coprocessorHost.postGet(get, results); } metricsUpdateForGet(results, before); return results; } void metricsUpdateForGet(List results, long before) { if (this.metricsRegion != null) { this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before); } if (rsServices != null && this.rsServices.getMetrics() != null) { rsServices.getMetrics().updateReadQueryMeter(getTableDescriptor().getTableName(), 1); } } @Override public Result mutateRow(RowMutations rm) throws IOException { return mutateRow(rm, HConstants.NO_NONCE, HConstants.NO_NONCE); } public Result mutateRow(RowMutations rm, long nonceGroup, long nonce) throws IOException { final List m = rm.getMutations(); OperationStatus[] statuses = batchMutate(m.toArray(new Mutation[0]), true, nonceGroup, nonce); List results = new ArrayList<>(); for (OperationStatus status : statuses) { if (status.getResult() != null) { results.add(status.getResult()); } } if (results.isEmpty()) { return null; } // Merge the results of the Increment/Append operations List cells = new ArrayList<>(); for (Result result : results) { if (result.rawCells() != null) { cells.addAll(Arrays.asList(result.rawCells())); } } return Result.create(cells); } /** * Perform atomic (all or none) mutations within the region. * @param mutations The list of mutations to perform. mutations can contain * operations for multiple rows. Caller has to ensure that all rows are * contained in this region. * @param rowsToLock Rows to lock * @param nonceGroup Optional nonce group of the operation (client Id) * @param nonce Optional nonce of the operation (unique random id to ensure "more * idempotence") If multiple rows are locked care should be taken that * rowsToLock is sorted in order to avoid deadlocks. */ @Override public void mutateRowsWithLocks(Collection mutations, Collection rowsToLock, long nonceGroup, long nonce) throws IOException { batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]), true, nonceGroup, nonce) { @Override public MiniBatchOperationInProgress lockRowsAndBuildMiniBatch(List acquiredRowLocks) throws IOException { RowLock prevRowLock = null; for (byte[] row : rowsToLock) { try { RowLock rowLock = region.getRowLock(row, false, prevRowLock); // write lock if (rowLock != prevRowLock) { acquiredRowLocks.add(rowLock); prevRowLock = rowLock; } } catch (IOException ioe) { LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this, ioe); throw ioe; } } return createMiniBatch(size(), size()); } }); } /** Returns statistics about the current load of the region */ public ClientProtos.RegionLoadStats getLoadStatistics() { if (!regionStatsEnabled) { return null; } ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder(); stats.setMemStoreLoad((int) (Math.min(100, (this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize))); if (rsServices.getHeapMemoryManager() != null) { // the HeapMemoryManager uses -0.0 to signal a problem asking the JVM, // so we could just do the calculation below and we'll get a 0. // treating it as a special case analogous to no HMM instead so that it can be // programatically treated different from using <1% of heap. 
final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent(); if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) { stats.setHeapOccupancy((int) (occupancy * 100)); } } stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 ? 100 : rsServices.getCompactionPressure() * 100)); return stats.build(); } @Override public void processRowsWithLocks(RowProcessor processor) throws IOException { processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE); } @Override public void processRowsWithLocks(RowProcessor processor, long nonceGroup, long nonce) throws IOException { processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce); } @Override public void processRowsWithLocks(RowProcessor processor, long timeout, long nonceGroup, long nonce) throws IOException { for (byte[] row : processor.getRowsToLock()) { checkRow(row, "processRowsWithLocks"); } if (!processor.readOnly()) { checkReadOnly(); } checkResources(); startRegionOperation(); WALEdit walEdit = new WALEdit(); // STEP 1. Run pre-process hook preProcess(processor, walEdit); // Short circuit the read only case if (processor.readOnly()) { try { long now = EnvironmentEdgeManager.currentTime(); doProcessRowWithTimeout(processor, now, this, null, null, timeout); processor.postProcess(this, walEdit, true); } finally { closeRegionOperation(); } return; } boolean locked = false; List acquiredRowLocks = null; List mutations = new ArrayList<>(); Collection rowsToLock = processor.getRowsToLock(); // This is assigned by mvcc either explicity in the below or in the guts of the WAL append // when it assigns the edit a sequencedid (A.K.A the mvcc write number). WriteEntry writeEntry = null; MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing(); // Check for thread interrupt status in case we have been signaled from // #interruptRegionOperation. checkInterrupt(); try { boolean success = false; try { // STEP 2. Acquire the row lock(s) acquiredRowLocks = new ArrayList<>(rowsToLock.size()); RowLock prevRowLock = null; for (byte[] row : rowsToLock) { // Attempt to lock all involved rows, throw if any lock times out // use a writer lock for mixed reads and writes RowLock rowLock = getRowLockInternal(row, false, prevRowLock); if (rowLock != prevRowLock) { acquiredRowLocks.add(rowLock); prevRowLock = rowLock; } } // Check for thread interrupt status in case we have been signaled from // #interruptRegionOperation. Do it before we take the lock and disable interrupts for // the WAL append. checkInterrupt(); // STEP 3. Region lock lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 1 : acquiredRowLocks.size()); locked = true; // From this point until memstore update this operation should not be interrupted. disableInterrupts(); long now = EnvironmentEdgeManager.currentTime(); // STEP 4. Let the processor scan the rows, generate mutations and add waledits doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout); if (!mutations.isEmpty()) { writeRequestsCount.add(mutations.size()); // STEP 5. Call the preBatchMutate hook processor.preBatchMutate(this, walEdit); // STEP 6. Append and sync if walEdit has data to write out. if (!walEdit.isEmpty()) { writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()), processor.getClusterIds(), now, nonceGroup, nonce); } else { // We are here if WAL is being skipped. writeEntry = this.mvcc.begin(); } // STEP 7. 
Apply to memstore long sequenceId = writeEntry.getWriteNumber(); for (Mutation m : mutations) { // Handle any tag based cell features. // TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before // so tags go into WAL? rewriteCellTags(m.getFamilyCellMap(), m); for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) { Cell cell = cellScanner.current(); if (walEdit.isEmpty()) { // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id. // If no WAL, need to stamp it here. PrivateCellUtil.setSequenceId(cell, sequenceId); } applyToMemStore(getStore(cell), cell, memstoreAccounting); } } // STEP 8. call postBatchMutate hook processor.postBatchMutate(this); // STEP 9. Complete mvcc. mvcc.completeAndWait(writeEntry); writeEntry = null; // STEP 10. Release region lock if (locked) { this.updatesLock.readLock().unlock(); locked = false; } // STEP 11. Release row lock(s) releaseRowLocks(acquiredRowLocks); if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.getTableName(), mutations.size()); } } success = true; } finally { // Call complete rather than completeAndWait because we probably had error if walKey != null if (writeEntry != null) mvcc.complete(writeEntry); if (locked) { this.updatesLock.readLock().unlock(); } // release locks if some were acquired but another timed out releaseRowLocks(acquiredRowLocks); enableInterrupts(); } // 12. Run post-process hook processor.postProcess(this, walEdit, success); } finally { closeRegionOperation(); if (!mutations.isEmpty()) { this.incMemStoreSize(memstoreAccounting.getMemStoreSize()); requestFlushIfNeeded(); } } } private void preProcess(final RowProcessor processor, final WALEdit walEdit) throws IOException { try { processor.preProcess(this, walEdit); } catch (IOException e) { closeRegionOperation(); throw e; } } private void doProcessRowWithTimeout(final RowProcessor processor, final long now, final HRegion region, final List mutations, final WALEdit walEdit, final long timeout) throws IOException { // Short circuit the no time bound case. if (timeout < 0) { try { processor.process(now, region, mutations, walEdit); } catch (IOException e) { String row = processor.getRowsToLock().isEmpty() ? "" : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); throw e; } return; } // Case with time bound FutureTask task = new FutureTask<>(new Callable() { @Override public Void call() throws IOException { try { processor.process(now, region, mutations, walEdit); return null; } catch (IOException e) { String row = processor.getRowsToLock().isEmpty() ? "" : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); throw e; } } }); rowProcessorExecutor.execute(task); try { task.get(timeout, TimeUnit.MILLISECONDS); } catch (InterruptedException ie) { throw throwOnInterrupt(ie); } catch (TimeoutException te) { String row = processor.getRowsToLock().isEmpty() ? 
"" : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; LOG.error("RowProcessor timeout: {} ms, in region {}, {}", timeout, getRegionInfo().getRegionNameAsString(), row); throw new IOException(te); } catch (Exception e) { throw new IOException(e); } } @Override public Result append(Append append) throws IOException { return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); } public Result append(Append append, long nonceGroup, long nonce) throws IOException { return TraceUtil.trace(() -> { checkReadOnly(); checkResources(); startRegionOperation(Operation.APPEND); try { // All edits for the given row (across all column families) must happen atomically. return mutate(append, true, nonceGroup, nonce).getResult(); } finally { closeRegionOperation(Operation.APPEND); } }, () -> createRegionSpan("Region.append")); } @Override public Result increment(Increment increment) throws IOException { return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); } public Result increment(Increment increment, long nonceGroup, long nonce) throws IOException { return TraceUtil.trace(() -> { checkReadOnly(); checkResources(); startRegionOperation(Operation.INCREMENT); try { // All edits for the given row (across all column families) must happen atomically. return mutate(increment, true, nonceGroup, nonce).getResult(); } finally { closeRegionOperation(Operation.INCREMENT); } }, () -> createRegionSpan("Region.increment")); } private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List clusterIds, long now, long nonceGroup, long nonce) throws IOException { return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce, SequenceId.NO_SEQUENCE_ID); } /** Returns writeEntry associated with this append */ private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List clusterIds, long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException { Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(), "WALEdit is null or empty!"); Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID, "Invalid replay sequence Id for replay WALEdit!"); // Using default cluster id, as this can only happen in the originating cluster. // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey // here instead of WALKeyImpl directly to support legacy coprocessors. WALKeyImpl walKey = walEdit.isReplay() ? new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, nonceGroup, nonce, mvcc) : new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, nonceGroup, nonce, mvcc, this.getReplicationScope()); if (walEdit.isReplay()) { walKey.setOrigLogSeqNum(origLogSeqNum); } // don't call the coproc hook for writes to the WAL caused by // system lifecycle events like flushes or compactions if (this.coprocessorHost != null && !walEdit.isMetaEdit()) { this.coprocessorHost.preWALAppend(walKey, walEdit); } WriteEntry writeEntry = null; try { long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit); // Call sync on our edit. 
if (txid != 0) { sync(txid, durability); } writeEntry = walKey.getWriteEntry(); } catch (IOException ioe) { if (walKey != null && walKey.getWriteEntry() != null) { mvcc.complete(walKey.getWriteEntry()); } throw ioe; } return writeEntry; } // // New HBASE-880 Helpers // void checkFamily(final byte[] family) throws NoSuchColumnFamilyException { if (!this.htableDescriptor.hasColumnFamily(family)) { throw new NoSuchColumnFamilyException("Column family " + Bytes.toString(family) + " does not exist in region " + this + " in table " + this.htableDescriptor); } } public static final long FIXED_OVERHEAD = ClassSize.estimateBase(HRegion.class, false); // woefully out of date - currently missing: // 1 x HashMap - coprocessorServiceHandlers // 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL, // checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount, // writeRequestsCount // 1 x HRegion$WriteState - writestate // 1 x RegionCoprocessorHost - coprocessorHost // 1 x RegionSplitPolicy - splitPolicy // 1 x MetricsRegion - metricsRegion // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper public static final long DEEP_OVERHEAD = FIXED_OVERHEAD + ClassSize.OBJECT + // closeLock (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing (3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL, // compactionsFailed (3 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints, regionLockHolders WriteState.HEAP_SIZE + // writestate ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock MultiVersionConcurrencyControl.FIXED_SIZE // mvcc + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress + ClassSize.STORE_SERVICES // store services + StoreHotnessProtector.FIXED_SIZE; @Override public long heapSize() { // this does not take into account row locks, recent flushes, mvcc entries, and more return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum(); } /** * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to be * available for handling Region#execService(com.google.protobuf.RpcController, * org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls. *
* Only a single instance may be registered per region for a given {@link Service} subclass (the * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}. * After the first registration, subsequent calls with the same service name will fail with a * return value of {@code false}. *
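   * <p>
   * A minimal sketch of a registration call (not from the original docs); {@code MyRowCountService}
   * stands in for a generated protobuf {@code Service} implementation supplied by a coprocessor:
   * <pre>{@code
   * com.google.protobuf.Service endpoint = new MyRowCountService(); // hypothetical implementation
   * if (!region.registerService(endpoint)) {
   *   // a service with the same fully-qualified name was already registered for this region
   * }
   * }</pre>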
* @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint * @return {@code true} if the registration was successful, {@code false} otherwise */ public boolean registerService(com.google.protobuf.Service instance) { /* * No stacking of instances is allowed for a single service name */ com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType(); String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc); if (coprocessorServiceHandlers.containsKey(serviceName)) { LOG.error("Coprocessor service {} already registered, rejecting request from {} in region {}", serviceName, instance, this); return false; } coprocessorServiceHandlers.put(serviceName, instance); if (LOG.isDebugEnabled()) { LOG.debug("Registered coprocessor service: region=" + Bytes.toStringBinary(getRegionInfo().getRegionName()) + " service=" + serviceName); } return true; } /** * Executes a single protocol buffer coprocessor endpoint {@link Service} method using the * registered protocol handlers. {@link Service} implementations must be registered via the * {@link #registerService(com.google.protobuf.Service)} method before they are available. * @param controller an {@code RpcContoller} implementation to pass to the invoked service * @param call a {@code CoprocessorServiceCall} instance identifying the service, method, * and parameters for the method invocation * @return a protocol buffer {@code Message} instance containing the method's result * @throws IOException if no registered service handler is found or an error occurs during the * invocation * @see #registerService(com.google.protobuf.Service) */ public com.google.protobuf.Message execService(com.google.protobuf.RpcController controller, CoprocessorServiceCall call) throws IOException { String serviceName = call.getServiceName(); com.google.protobuf.Service service = coprocessorServiceHandlers.get(serviceName); if (service == null) { throw new UnknownProtocolException(null, "No registered coprocessor service found for " + serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName())); } com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType(); String methodName = call.getMethodName(); com.google.protobuf.Descriptors.MethodDescriptor methodDesc = CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc); com.google.protobuf.Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType(); org.apache.hadoop.hbase.protobuf.ProtobufUtil.mergeFrom(builder, call.getRequest().toByteArray()); com.google.protobuf.Message request = CoprocessorRpcUtils.getRequest(service, methodDesc, call.getRequest()); if (coprocessorHost != null) { request = coprocessorHost.preEndpointInvocation(service, methodName, request); } final com.google.protobuf.Message.Builder responseBuilder = service.getResponsePrototype(methodDesc).newBuilderForType(); service.callMethod(methodDesc, controller, request, new com.google.protobuf.RpcCallback() { @Override public void run(com.google.protobuf.Message message) { if (message != null) { responseBuilder.mergeFrom(message); } } }); if (coprocessorHost != null) { coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder); } IOException exception = org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller); if (exception != null) { throw exception; } return responseBuilder.build(); } public Optional checkSplit() { return checkSplit(false); } /** * 
Return the split point. An empty result indicates the region isn't splittable. */ public Optional checkSplit(boolean force) { // Can't split META if ( this.getRegionInfo().isMetaRegion() || TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable()) ) { return Optional.empty(); } // Can't split a region that is closing. if (this.isClosing()) { return Optional.empty(); } if (!force && !splitPolicy.shouldSplit()) { return Optional.empty(); } byte[] ret = splitPolicy.getSplitPoint(); if (ret != null && ret.length > 0) { ret = splitRestriction.getRestrictedSplitPoint(ret); } if (ret != null) { try { checkRow(ret, "calculated split"); } catch (IOException e) { LOG.error("Ignoring invalid split for region {}", this, e); return Optional.empty(); } return Optional.of(ret); } else { return Optional.empty(); } } /** Returns The priority that this region should have in the compaction queue */ public int getCompactPriority() { if (conf.getBoolean(SPLIT_IGNORE_BLOCKING_ENABLED_KEY, false) && checkSplit().isPresent()) { // if a region should split, split it before compact return Store.PRIORITY_USER; } return stores.values().stream().mapToInt(HStore::getCompactPriority).min() .orElse(Store.NO_PRIORITY); } /** Returns the coprocessor host */ public RegionCoprocessorHost getCoprocessorHost() { return coprocessorHost; } /** @param coprocessorHost the new coprocessor host */ public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) { this.coprocessorHost = coprocessorHost; } @Override public void startRegionOperation() throws IOException { startRegionOperation(Operation.ANY); } @Override public void startRegionOperation(Operation op) throws IOException { boolean isInterruptableOp = false; switch (op) { case GET: // interruptible read operations case SCAN: isInterruptableOp = true; checkReadsEnabled(); break; case INCREMENT: // interruptible write operations case APPEND: case PUT: case DELETE: case BATCH_MUTATE: case CHECK_AND_MUTATE: isInterruptableOp = true; break; default: // all others break; } if ( op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION || op == Operation.COMPACT_REGION || op == Operation.COMPACT_SWITCH ) { // split, merge or compact region doesn't need to check the closing/closed state or lock the // region return; } if (this.closing.get()) { throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); } lock(lock.readLock()); // Update regionLockHolders ONLY for any startRegionOperation call that is invoked from // an RPC handler Thread thisThread = Thread.currentThread(); if (isInterruptableOp) { regionLockHolders.put(thisThread, true); } if (this.closed.get()) { lock.readLock().unlock(); throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); } // The unit for snapshot is a region. So, all stores for this region must be // prepared for snapshot operation before proceeding. 
if (op == Operation.SNAPSHOT) { stores.values().forEach(HStore::preSnapshotOperation); } try { if (coprocessorHost != null) { coprocessorHost.postStartRegionOperation(op); } } catch (Exception e) { if (isInterruptableOp) { // would be harmless to remove what we didn't add but we know by 'isInterruptableOp' // if we added this thread to regionLockHolders regionLockHolders.remove(thisThread); } lock.readLock().unlock(); throw new IOException(e); } } @Override public void closeRegionOperation() throws IOException { closeRegionOperation(Operation.ANY); } @Override public void closeRegionOperation(Operation operation) throws IOException { if (operation == Operation.SNAPSHOT) { stores.values().forEach(HStore::postSnapshotOperation); } Thread thisThread = Thread.currentThread(); regionLockHolders.remove(thisThread); lock.readLock().unlock(); if (coprocessorHost != null) { coprocessorHost.postCloseRegionOperation(operation); } } /** * This method needs to be called before any public call that reads or modifies stores in bulk. It * has to be called just before a try. #closeBulkRegionOperation needs to be called in the try's * finally block Acquires a writelock and checks if the region is closing or closed. * @throws NotServingRegionException when the region is closing or closed * @throws RegionTooBusyException if failed to get the lock in time * @throws InterruptedIOException if interrupted while waiting for a lock */ private void startBulkRegionOperation(boolean writeLockNeeded) throws IOException { if (this.closing.get()) { throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); } if (writeLockNeeded) lock(lock.writeLock()); else lock(lock.readLock()); if (this.closed.get()) { if (writeLockNeeded) lock.writeLock().unlock(); else lock.readLock().unlock(); throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); } regionLockHolders.put(Thread.currentThread(), true); } /** * Closes the lock. This needs to be called in the finally block corresponding to the try block of * #startRegionOperation */ private void closeBulkRegionOperation() { regionLockHolders.remove(Thread.currentThread()); if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock(); else lock.readLock().unlock(); } /** * Update LongAdders for number of puts without wal and the size of possible data loss. These * information are exposed by the region server metrics. */ private void recordMutationWithoutWal(final Map> familyMap) { numMutationsWithoutWAL.increment(); if (numMutationsWithoutWAL.sum() <= 1) { LOG.info("writing data to region " + this + " with WAL disabled. Data may be lost in the event of a crash."); } long mutationSize = 0; for (List cells : familyMap.values()) { // Optimization: 'foreach' loop is not used. See: // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i = 0; i < listSize; i++) { Cell cell = cells.get(i); mutationSize += cell.getSerializedSize(); } } dataInMemoryWithoutWAL.add(mutationSize); } private void lock(final Lock lock) throws IOException { lock(lock, 1); } /** * Try to acquire a lock. Throw RegionTooBusyException if failed to get the lock in time. Throw * InterruptedIOException if interrupted while waiting for the lock. 
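   * <p>
   * Worked example of the wait computation in the body (the configuration numbers are made up):
   * with {@code busyWaitDuration = 2000}, {@code maxBusyWaitMultiplier = 3} and
   * {@code maxBusyWaitDuration = 30000}, a call with {@code multiplier = 5} waits
   * {@code min(30000, 2000 * min(5, 3)) = 6000} ms before failing with RegionTooBusyException.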

  /**
   * Try to acquire a lock. Throw RegionTooBusyException if failed to get the lock in time. Throw
   * InterruptedIOException if interrupted while waiting for the lock.
   */
  private void lock(final Lock lock, final int multiplier) throws IOException {
    try {
      final long waitTime = Math.min(maxBusyWaitDuration,
        busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
      if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
        // Don't print millis. Message is used as a key over in
        // RetriesExhaustedWithDetailsException processing.
        final String regionName =
          this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getRegionNameAsString();
        final String serverName = this.getRegionServerServices() == null
          ? "unknown"
          : (this.getRegionServerServices().getServerName() == null
            ? "unknown"
            : this.getRegionServerServices().getServerName().toString());
        RegionTooBusyException rtbe = new RegionTooBusyException(
          "Failed to obtain lock; regionName=" + regionName + ", server=" + serverName);
        LOG.warn("Region is too busy to allow lock acquisition.", rtbe);
        throw rtbe;
      }
    } catch (InterruptedException ie) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Interrupted while waiting for a lock in region {}", this);
      }
      throw throwOnInterrupt(ie);
    }
  }

  /**
   * Calls sync with the given transaction ID.
   * @param txid should sync up to which transaction
   * @throws IOException If anything goes wrong with DFS
   */
  private void sync(long txid, Durability durability) throws IOException {
    if (this.getRegionInfo().isMetaRegion()) {
      this.wal.sync(txid);
    } else {
      switch (durability) {
        case USE_DEFAULT:
          // do what table defaults to
          if (shouldSyncWAL()) {
            this.wal.sync(txid);
          }
          break;
        case SKIP_WAL:
          // nothing to do
          break;
        case ASYNC_WAL:
          // nothing to do
          break;
        case SYNC_WAL:
          this.wal.sync(txid, false);
          break;
        case FSYNC_WAL:
          this.wal.sync(txid, true);
          break;
        default:
          throw new RuntimeException("Unknown durability " + durability);
      }
    }
  }

  /**
   * Check whether we should sync the wal from the table's durability settings.
   */
  private boolean shouldSyncWAL() {
    return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal();
  }

  /** Returns the latest sequence number that was read from storage when this region was opened */
  public long getOpenSeqNum() {
    return this.openSeqNum;
  }

  @Override
  public Map<byte[], Long> getMaxStoreSeqId() {
    return this.maxSeqIdInStores;
  }

  public long getOldestSeqIdOfStore(byte[] familyName) {
    return wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName);
  }

  @Override
  public CompactionState getCompactionState() {
    boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
    return (hasMajor
      ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
      : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
  }

  public void reportCompactionRequestStart(boolean isMajor) {
    (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
  }

  public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) {
    int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();

    // metrics
    compactionsFinished.increment();
    compactionNumFilesCompacted.add(numFiles);
    compactionNumBytesCompacted.add(filesSizeCompacted);

    assert newValue >= 0;
  }

  public void reportCompactionRequestFailure() {
    compactionsFailed.increment();
  }

  public void incrementCompactionsQueuedCount() {
    compactionsQueued.increment();
  }

  public void decrementCompactionsQueuedCount() {
    compactionsQueued.decrement();
  }

  public void incrementFlushesQueuedCount() {
    flushesQueued.increment();
  }

  protected void decrementFlushesQueuedCount() {
    flushesQueued.decrement();
  }
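
  // --------------------------------------------------------------------------
  // Editor's note: an illustrative summary, not part of the original HRegion
  // source. It restates how sync(txid, durability) above maps a mutation's
  // Durability onto WAL sync calls for a non-meta region:
  //
  //   SKIP_WAL / ASYNC_WAL -> no synchronous sync from this method
  //   SYNC_WAL             -> wal.sync(txid, false)  // hsync not forced
  //   FSYNC_WAL            -> wal.sync(txid, true)   // force hsync
  //   USE_DEFAULT          -> wal.sync(txid) only when the table-level
  //                           regionDurability is stricter than ASYNC_WAL
  //                           (see shouldSyncWAL())
  // --------------------------------------------------------------------------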

  /**
   * If a handler thread is eligible for interrupt, make it ineligible. Should be paired with
   * {@link #enableInterrupts()}.
   */
  void disableInterrupts() {
    regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> false);
  }

  /**
   * If a handler thread was made ineligible for interrupt via {@link #disableInterrupts()}, make
   * it eligible again. No-op if interrupts are already enabled.
   */
  void enableInterrupts() {
    regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> true);
  }

  /**
   * Interrupt any region operations that have acquired the region lock via
   * {@link #startRegionOperation(org.apache.hadoop.hbase.regionserver.Region.Operation)}, or
   * {@link #startBulkRegionOperation(boolean)}.
   */
  private void interruptRegionOperations() {
    for (Map.Entry<Thread, Boolean> entry : regionLockHolders.entrySet()) {
      // An entry in this map will have a boolean value indicating if it is currently
      // eligible for interrupt; if so, we should interrupt it.
      if (entry.getValue().booleanValue()) {
        entry.getKey().interrupt();
      }
    }
  }

  /**
   * Check thread interrupt status and throw an exception if interrupted.
   * @throws NotServingRegionException if region is closing
   * @throws InterruptedIOException    if interrupted but region is not closing
   */
  // Package scope for tests
  void checkInterrupt() throws NotServingRegionException, InterruptedIOException {
    if (Thread.interrupted()) {
      if (this.closing.get()) {
        throw new NotServingRegionException(
          getRegionInfo().getRegionNameAsString() + " is closing");
      }
      throw new InterruptedIOException();
    }
  }

  /**
   * Throw the correct exception upon interrupt
   * @param t cause
   */
  // Package scope for tests
  IOException throwOnInterrupt(Throwable t) {
    if (this.closing.get()) {
      return (NotServingRegionException) new NotServingRegionException(
        getRegionInfo().getRegionNameAsString() + " is closing").initCause(t);
    }
    return (InterruptedIOException) new InterruptedIOException().initCause(t);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void onConfigurationChange(Configuration conf) {
    this.storeHotnessProtector.update(conf);
    // update coprocessorHost if the configuration has changed.
    if (
      CoprocessorConfigurationUtil.checkConfigurationChange(getReadOnlyConfiguration(), conf,
        CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
        CoprocessorHost.USER_REGION_COPROCESSOR_CONF_KEY)
    ) {
      LOG.info("Update the system coprocessors because the configuration has changed");
      decorateRegionConfiguration(conf);
      this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void registerChildren(ConfigurationManager manager) {
    configurationManager = manager;
    stores.values().forEach(manager::registerObserver);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void deregisterChildren(ConfigurationManager manager) {
    stores.values().forEach(configurationManager::deregisterObserver);
  }

  @Override
  public CellComparator getCellComparator() {
    return cellComparator;
  }

  public long getMemStoreFlushSize() {
    return this.memstoreFlushSize;
  }

  //// method for debugging tests
  void throwException(String title, String regionName) {
    StringBuilder buf = new StringBuilder();
    buf.append(title + ", ");
    buf.append(getRegionInfo().toString());
    buf.append(getRegionInfo().isMetaRegion() ? " meta region " : " ");
    buf.append("stores: ");
    for (HStore s : stores.values()) {
      buf.append(s.getColumnFamilyDescriptor().getNameAsString());
      buf.append(" size: ");
      buf.append(s.getMemStoreSize().getDataSize());
      buf.append(" ");
    }
    buf.append("end-of-stores");
    buf.append(", memstore size ");
    buf.append(getMemStoreDataSize());
    if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) {
      throw new RuntimeException(buf.toString());
    }
  }

  @Override
  public void requestCompaction(String why, int priority, boolean major,
    CompactionLifeCycleTracker tracker) throws IOException {
    if (major) {
      stores.values().forEach(HStore::triggerMajorCompaction);
    }
    rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker,
      RpcServer.getRequestUser().orElse(null));
  }

  @Override
  public void requestCompaction(byte[] family, String why, int priority, boolean major,
    CompactionLifeCycleTracker tracker) throws IOException {
    HStore store = stores.get(family);
    if (store == null) {
      throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family)
        + " does not exist in region " + getRegionInfo().getRegionNameAsString());
    }
    if (major) {
      store.triggerMajorCompaction();
    }
    rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker,
      RpcServer.getRequestUser().orElse(null));
  }

  private void requestFlushIfNeeded() throws RegionTooBusyException {
    if (isFlushSize(this.memStoreSizing.getMemStoreSize())) {
      requestFlush();
    }
  }

  private void requestFlush() {
    if (this.rsServices == null) {
      return;
    }
    requestFlush0(FlushLifeCycleTracker.DUMMY);
  }

  private void requestFlush0(FlushLifeCycleTracker tracker) {
    boolean shouldFlush = false;
    synchronized (writestate) {
      if (!this.writestate.isFlushRequested()) {
        shouldFlush = true;
        writestate.flushRequested = true;
      }
    }
    if (shouldFlush) {
      // Make request outside of synchronize block; HBASE-818.
      this.rsServices.getFlushRequester().requestFlush(this, tracker);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName());
      }
    } else {
      tracker.notExecuted("Flush already requested on " + this);
    }
  }

  @Override
  public void requestFlush(FlushLifeCycleTracker tracker) throws IOException {
    requestFlush0(tracker);
  }

  /**
   * This method modifies the region's configuration in order to inject replication-related
   * features.
   * @param conf region configurations
   */
  private static void decorateRegionConfiguration(Configuration conf) {
    if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) {
      String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, "");
      String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName();
      if (!plugins.contains(replicationCoprocessorClass)) {
        conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
          (plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass);
      }
    }
  }

  public void addReadRequestsCount(long readRequestsCount) {
    this.readRequestsCount.add(readRequestsCount);
  }

  public void addWriteRequestsCount(long writeRequestsCount) {
    this.writeRequestsCount.add(writeRequestsCount);
  }

  @RestrictedApi(explanation = "Should only be called in tests", link = "",
      allowedOnPath = ".*/src/test/.*")
  boolean isReadsEnabled() {
    return this.writestate.readsEnabled;
  }
}
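
// ----------------------------------------------------------------------------
// Editor's note: an illustrative sketch, not part of the original HRegion
// source. It shows the effect of decorateRegionConfiguration(conf) on the
// region coprocessor property (CoprocessorHost.REGION_COPROCESSOR_CONF_KEY)
// when bulk-load replication is enabled, assuming a pre-existing value of
// "com.example.MyObserver" (a hypothetical coprocessor class):
//
//   before: <REGION_COPROCESSOR_CONF_KEY> = com.example.MyObserver
//   after:  <REGION_COPROCESSOR_CONF_KEY> =
//             com.example.MyObserver,<canonical name of ReplicationObserver>
//
// If the property is empty, only the ReplicationObserver class name is set;
// if the class is already listed, the value is left unchanged.
// ----------------------------------------------------------------------------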



