/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY;
import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.REGION_NAMES_KEY;
import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.ROW_LOCK_READ_LOCK_KEY;
import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
import com.google.errorprone.annotations.RestrictedApi;
import edu.umd.cs.findbugs.annotations.Nullable;
import io.opentelemetry.api.trace.Span;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.Objects;
import java.util.Optional;
import java.util.RandomAccess;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.LongAdder;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.CompoundConfiguration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.ExtendedCellBuilderFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MetaCellComparator;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.TagUtil;
import org.apache.hadoop.hbase.client.Append;
import org.apache.hadoop.hbase.client.CheckAndMutate;
import org.apache.hadoop.hbase.client.CheckAndMutateResult;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.CompactionState;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.IsolationLevel;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.conf.ConfigurationManager;
import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
import org.apache.hadoop.hbase.exceptions.TimeoutIOException;
import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ByteArrayComparable;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.io.HFileLink;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.TimeRange;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
import org.apache.hadoop.hbase.ipc.RpcCall;
import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.mob.MobFileCache;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager;
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker;
import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector;
import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
import org.apache.hadoop.hbase.replication.ReplicationUtils;
import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.CoprocessorConfigurationUtil;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HashedBytes;
import org.apache.hadoop.hbase.util.NonceKey;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.TableDescriptorChecker;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALSplitUtil;
import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay;
import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
import org.apache.hbase.thirdparty.com.google.protobuf.Service;
import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;
import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;
/**
* Regions store data for a certain region of a table. It stores all columns for each row. A given
* table consists of one or more Regions.
*
* A Region is defined by its table and its key extent.
*
* Locking at the Region level serves only one purpose: preventing the region from being closed (and
* consequently split) while other operations are ongoing. Each row level operation obtains both a
* row lock and a region read lock for the duration of the operation. While a scanner is being
* constructed, getScanner holds a read lock. If the scanner is successfully constructed, it holds a
* read lock until it is closed. A close takes out a write lock and consequently will block for
* ongoing operations and will block new operations from starting while the close is in progress.
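*
* A hedged sketch of the row-lock pattern described above, using the public {@link Region} API
* (illustrative only; normal gets and mutations acquire these locks internally):
* <pre>{@code
* Region.RowLock rowLock = region.getRowLock(row, true); // true = shared (read) row lock
* try {
*   // operate on the row; the region cannot be closed or split while the lock is held
* } finally {
*   rowLock.release();
* }
* }</pre>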
*/
@SuppressWarnings("deprecation")
@InterfaceAudience.Private
public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
private static final Logger LOG = LoggerFactory.getLogger(HRegion.class);
public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
"hbase.hregion.scan.loadColumnFamiliesOnDemand";
public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize";
public static final int DEFAULT_MAX_CELL_SIZE = 10485760;
public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE =
"hbase.regionserver.minibatch.size";
public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000;
public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync";
public static final boolean DEFAULT_WAL_HSYNC = false;
/** Parameter name for compaction after bulkload */
public static final String COMPACTION_AFTER_BULKLOAD_ENABLE =
"hbase.compaction.after.bulkload.enable";
/** Config for allow split when file count greater than the configured blocking file count */
public static final String SPLIT_IGNORE_BLOCKING_ENABLED_KEY =
"hbase.hregion.split.ignore.blocking.enabled";
public static final String REGION_STORAGE_POLICY_KEY = "hbase.hregion.block.storage.policy";
public static final String DEFAULT_REGION_STORAGE_POLICY = "NONE";
/**
* This is for using HRegion as local storage, where we may put the recovered edits in a
* special place. Once this is set, we will only replay the recovered edits under this directory
* and ignore the original replay directory configs.
*/
public static final String SPECIAL_RECOVERED_EDITS_DIR =
"hbase.hregion.special.recovered.edits.dir";
/**
* Whether to use {@link MetaCellComparator} even if we are not meta region. Used when creating
* master local region.
*/
public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator";
public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false;
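/*
* A hedged example of overriding a few of the keys above programmatically; hbase-site.xml entries
* work the same way and the values shown are illustrative, not recommendations:
*
*   Configuration conf = HBaseConfiguration.create();
*   conf.setLong(HBASE_MAX_CELL_SIZE_KEY, 32 * 1024 * 1024L);  // allow 32 MB cells
*   conf.setInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, 10000);     // smaller write mini-batches
*   conf.setBoolean(WAL_HSYNC_CONF_KEY, true);                 // hsync the WAL instead of hflush
*/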
final AtomicBoolean closed = new AtomicBoolean(false);
/*
* Closing can take some time; use the closing flag if there is stuff we don't want to do while in
* closing state; e.g. like offer this region up to the master as a region to close if the
* carrying regionserver is overloaded. Once set, it is never cleared.
*/
final AtomicBoolean closing = new AtomicBoolean(false);
/**
* The max sequence id of flushed data on this region. There is no edit in memory that is less
* than this sequence id.
*/
private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
/**
* Record the sequence id of last flush operation. Can be in advance of {@link #maxFlushedSeqId}
* when flushing a single column family. In this case, {@link #maxFlushedSeqId} will be older than
* the oldest edit in memory.
*/
private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
/**
* The sequence id of the last replayed open region event from the primary region. This is used to
* skip entries before this due to the possibility of replay edits coming out of order from
* replication.
*/
protected volatile long lastReplayedOpenRegionSeqId = -1L;
protected volatile long lastReplayedCompactionSeqId = -1L;
//////////////////////////////////////////////////////////////////////////////
// Members
//////////////////////////////////////////////////////////////////////////////
// map from a locked row to the context for that lock including:
// - CountDownLatch for threads waiting on that row
// - the thread that owns the lock (allow reentrancy)
// - reference count of (reentrant) locks held by the thread
// - the row itself
private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
new ConcurrentHashMap<>();
protected final Map<byte[], HStore> stores =
new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR);
// TODO: account for each registered handler in HeapSize computation
private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
// Track data size in all memstores
private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing();
RegionServicesForStores regionServicesForStores;
// Debug possible data loss due to WAL off
final LongAdder numMutationsWithoutWAL = new LongAdder();
final LongAdder dataInMemoryWithoutWAL = new LongAdder();
// Debug why CAS operations are taking a while.
final LongAdder checkAndMutateChecksPassed = new LongAdder();
final LongAdder checkAndMutateChecksFailed = new LongAdder();
// Number of requests
// Count rows for scan
final LongAdder readRequestsCount = new LongAdder();
final LongAdder filteredReadRequestsCount = new LongAdder();
// Count rows for multi row mutations
final LongAdder writeRequestsCount = new LongAdder();
// Number of requests blocked by memstore size.
private final LongAdder blockedRequestsCount = new LongAdder();
// Compaction LongAdders
final LongAdder compactionsFinished = new LongAdder();
final LongAdder compactionsFailed = new LongAdder();
final LongAdder compactionNumFilesCompacted = new LongAdder();
final LongAdder compactionNumBytesCompacted = new LongAdder();
final LongAdder compactionsQueued = new LongAdder();
final LongAdder flushesQueued = new LongAdder();
private BlockCache blockCache;
private MobFileCache mobFileCache;
private final WAL wal;
private final HRegionFileSystem fs;
protected final Configuration conf;
private final Configuration baseConf;
private final int rowLockWaitDuration;
static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
private Path regionWalDir;
private FileSystem walFS;
// set to true if the region is restored from snapshot
private boolean isRestoredRegion = false;
public void setRestoredRegion(boolean restoredRegion) {
isRestoredRegion = restoredRegion;
}
// The internal wait duration to acquire a lock before read/update
// from the region. It is not per row. The purpose of this wait time
// is to avoid waiting a long time while the region is busy, so that
// we can release the IPC handler soon enough to improve the
// availability of the region server. It can be adjusted by
// tuning configuration "hbase.busy.wait.duration".
final long busyWaitDuration;
static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
// If updating multiple rows in one call, wait longer,
// i.e. waiting for busyWaitDuration * # of rows. However,
// we can limit the max multiplier.
final int maxBusyWaitMultiplier;
// Max busy wait duration. There is no point to wait longer than the RPC
// purge timeout, when a RPC call will be terminated by the RPC engine.
final long maxBusyWaitDuration;
// Max cell size. If nonzero, the maximum allowed size for any given cell
// in bytes
final long maxCellSize;
// Number of mutations for minibatch processing.
private final int miniBatchSize;
// negative number indicates infinite timeout
static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
final ReadPointCalculationLock smallestReadPointCalcLock;
/**
* The sequence ID that was encountered when this region was opened.
*/
private long openSeqNum = HConstants.NO_SEQNUM;
/**
* The default setting for whether to enable on-demand CF loading for scan requests to this
* region. Requests can override it.
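* <p>
* The per-request override is a client-side scan attribute, e.g. (hedged sketch):
* <pre>{@code
* Scan scan = new Scan().setLoadColumnFamiliesOnDemand(true);
* }</pre>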
*/
private boolean isLoadingCfsOnDemandDefault = false;
private final AtomicInteger majorInProgress = new AtomicInteger(0);
private final AtomicInteger minorInProgress = new AtomicInteger(0);
//
// Context: During replay we want to ensure that we do not lose any data. So, we
// have to be conservative in how we replay wals. For each store, we calculate
// the maxSeqId up to which the store was flushed. And, skip the edits which
// are equal to or lower than maxSeqId for each store.
// The following map is populated when opening the region
Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR);
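// A hedged illustration of the skip rule described above (not the actual replay code): an edit for
// column family f carrying sequence id s is applied during replay only when
// s > maxSeqIdInStores.get(f); edits at or below that watermark already live in the store files.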
/** Saved state from replaying prepare flush cache */
private PrepareFlushResult prepareFlushResult = null;
private volatile ConfigurationManager configurationManager;
// Used for testing.
private volatile Long timeoutForWriteLock = null;
private final CellComparator cellComparator;
/**
* @return The smallest mvcc readPoint across all the scanners in this region. Writes older than
* this readPoint are included in every read operation.
*/
public long getSmallestReadPoint() {
// We need to ensure that while we are calculating the smallestReadPoint
// no new RegionScanners can grab a readPoint that we are unaware of.
smallestReadPointCalcLock.lock(ReadPointCalculationLock.LockType.CALCULATION_LOCK);
try {
long minimumReadPoint = mvcc.getReadPoint();
for (Long readPoint : this.scannerReadPoints.values()) {
minimumReadPoint = Math.min(minimumReadPoint, readPoint);
}
return minimumReadPoint;
} finally {
smallestReadPointCalcLock.unlock(ReadPointCalculationLock.LockType.CALCULATION_LOCK);
}
}
/*
* Data structure of write state flags used when coordinating flushes, compactions and closes.
*/
static class WriteState {
// Set while a memstore flush is happening.
volatile boolean flushing = false;
// Set when a flush has been requested.
volatile boolean flushRequested = false;
// Number of compactions running.
AtomicInteger compacting = new AtomicInteger(0);
// Gets set in close. If set, cannot compact or flush again.
volatile boolean writesEnabled = true;
// Set if region is read-only
volatile boolean readOnly = false;
// whether the reads are enabled. This is different than readOnly, because readOnly is
// static in the lifetime of the region, while readsEnabled is dynamic
volatile boolean readsEnabled = true;
/**
* Set flags that make this region read-only.
* @param onOff flip value for region r/o setting
*/
synchronized void setReadOnly(final boolean onOff) {
this.writesEnabled = !onOff;
this.readOnly = onOff;
}
boolean isReadOnly() {
return this.readOnly;
}
boolean isFlushRequested() {
return this.flushRequested;
}
void setReadsEnabled(boolean readsEnabled) {
this.readsEnabled = readsEnabled;
}
static final long HEAP_SIZE = ClassSize.align(ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
}
/**
* Objects from this class are created when flushing to describe all the different states that
* that method ends up in. The Result enum describes those states. The sequence id should only be
* specified if the flush was successful, and the failure message should only be specified if it
* didn't flush.
*/
public static class FlushResultImpl implements FlushResult {
final Result result;
final String failureReason;
final long flushSequenceId;
final boolean wroteFlushWalMarker;
/**
* Convenience constructor to use when the flush is successful; the failure message is set to
* null.
* @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
* @param flushSequenceId Generated sequence id that comes right after the edits in the
* memstores.
*/
FlushResultImpl(Result result, long flushSequenceId) {
this(result, flushSequenceId, null, false);
assert result == Result.FLUSHED_NO_COMPACTION_NEEDED
|| result == Result.FLUSHED_COMPACTION_NEEDED;
}
/**
* Convenience constructor to use when we cannot flush.
* @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
* @param failureReason Reason why we couldn't flush.
*/
FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
this(result, -1, failureReason, wroteFlushMarker);
assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
}
/**
* Constructor with all the parameters.
* @param result Any of the Result.
* @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
* @param failureReason Reason why we couldn't flush, or null.
*/
FlushResultImpl(Result result, long flushSequenceId, String failureReason,
boolean wroteFlushMarker) {
this.result = result;
this.flushSequenceId = flushSequenceId;
this.failureReason = failureReason;
this.wroteFlushWalMarker = wroteFlushMarker;
}
/**
* Convenience method, the equivalent of checking if result is FLUSHED_NO_COMPACTION_NEEDED or
* FLUSHED_COMPACTION_NEEDED.
* @return true if the memstores were flushed, else false.
*/
@Override
public boolean isFlushSucceeded() {
return result == Result.FLUSHED_NO_COMPACTION_NEEDED
|| result == Result.FLUSHED_COMPACTION_NEEDED;
}
/**
* Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
* @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
*/
@Override
public boolean isCompactionNeeded() {
return result == Result.FLUSHED_COMPACTION_NEEDED;
}
@Override
public String toString() {
return new StringBuilder().append("flush result:").append(result).append(", ")
.append("failureReason:").append(failureReason).append(",").append("flush seq id")
.append(flushSequenceId).toString();
}
@Override
public Result getResult() {
return result;
}
}
/** A result object from prepare flush cache stage */
static class PrepareFlushResult {
final FlushResultImpl result; // indicating a failure result from prepare
final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
final TreeMap<byte[], List<Path>> committedFiles;
final TreeMap<byte[], MemStoreSize> storeFlushableSize;
final long startTime;
final long flushOpSeqId;
final long flushedSeqId;
final MemStoreSizing totalFlushableSize;
/** Constructs an early exit case */
PrepareFlushResult(FlushResultImpl result, long flushSeqId) {
this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD);
}
/** Constructs a successful prepare flush result */
PrepareFlushResult(TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
TreeMap<byte[], List<Path>> committedFiles, TreeMap<byte[], MemStoreSize> storeFlushableSize,
long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) {
this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushSeqId,
flushedSeqId, totalFlushableSize);
}
private PrepareFlushResult(FlushResultImpl result,
TreeMap<byte[], StoreFlushContext> storeFlushCtxs, TreeMap<byte[], List<Path>> committedFiles,
TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId,
long flushedSeqId, MemStoreSizing totalFlushableSize) {
this.result = result;
this.storeFlushCtxs = storeFlushCtxs;
this.committedFiles = committedFiles;
this.storeFlushableSize = storeFlushableSize;
this.startTime = startTime;
this.flushOpSeqId = flushSeqId;
this.flushedSeqId = flushedSeqId;
this.totalFlushableSize = totalFlushableSize;
}
public FlushResult getResult() {
return this.result;
}
}
/**
* A class that tracks exceptions that have been observed in one batch. Not thread safe.
*/
static class ObservedExceptionsInBatch {
private boolean wrongRegion = false;
private boolean failedSanityCheck = false;
private boolean wrongFamily = false;
/** Returns If a {@link WrongRegionException} has been observed. */
boolean hasSeenWrongRegion() {
return wrongRegion;
}
/**
* Records that a {@link WrongRegionException} has been observed.
*/
void sawWrongRegion() {
wrongRegion = true;
}
/** Returns If a {@link FailedSanityCheckException} has been observed. */
boolean hasSeenFailedSanityCheck() {
return failedSanityCheck;
}
/**
* Records that a {@link FailedSanityCheckException} has been observed.
*/
void sawFailedSanityCheck() {
failedSanityCheck = true;
}
/** Returns If a {@link NoSuchColumnFamilyException} has been observed. */
boolean hasSeenNoSuchFamily() {
return wrongFamily;
}
/**
* Records that a {@link NoSuchColumnFamilyException} has been observed.
*/
void sawNoSuchFamily() {
wrongFamily = true;
}
}
final WriteState writestate = new WriteState();
long memstoreFlushSize;
final long timestampSlop;
final long rowProcessorTimeout;
// Last flush time for each Store. Useful when we are flushing for each column family
private final ConcurrentMap<HStore, Long> lastStoreFlushTimeMap = new ConcurrentHashMap<>();
protected RegionServerServices rsServices;
private RegionServerAccounting rsAccounting;
private long flushCheckInterval;
// flushPerChanges is to prevent too many changes in memstore
private long flushPerChanges;
private long blockingMemStoreSize;
// Used to guard closes
final ReentrantReadWriteLock lock;
// Used to track interruptible holders of the region lock. Currently that is only RPC handler
// threads. Boolean value in map determines if lock holder can be interrupted, normally true,
// but may be false when thread is transiting a critical section.
final ConcurrentHashMap<Thread, Boolean> regionLockHolders;
// Stop updates lock
private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
private final MultiVersionConcurrencyControl mvcc;
// Coprocessor host
private volatile RegionCoprocessorHost coprocessorHost;
private TableDescriptor htableDescriptor = null;
private RegionSplitPolicy splitPolicy;
private RegionSplitRestriction splitRestriction;
private FlushPolicy flushPolicy;
private final MetricsRegion metricsRegion;
private final MetricsRegionWrapperImpl metricsRegionWrapper;
private final Durability regionDurability;
private final boolean regionStatsEnabled;
// Stores the replication scope of the various column families of the table
// that has non-default scope
private final NavigableMap<byte[], Integer> replicationScope =
new TreeMap<>(Bytes.BYTES_COMPARATOR);
private final StoreHotnessProtector storeHotnessProtector;
/**
* HRegion constructor. This constructor should only be used for testing and extensions. Instances
* of HRegion should be instantiated with the {@link HRegion#createHRegion} or
* {@link HRegion#openHRegion} method.
* @param tableDir qualified path of directory where region should be located, usually the table
* directory.
* @param wal The WAL is the outbound log for any updates to the HRegion. The wal file is a
* logfile from the previous execution that's custom-computed for this HRegion.
* The HRegionServer computes and sorts the appropriate wal info for this
* HRegion. If there is a previous wal file (implying that the HRegion has been
* written-to before), then read it from the supplied path.
* @param fs is the filesystem.
* @param confParam is global configuration settings.
* @param regionInfo - RegionInfo that describes the region
* @param htd the table descriptor
* @param rsServices reference to {@link RegionServerServices} or null
* @deprecated Use other constructors.
*/
@Deprecated
public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
final Configuration confParam, final RegionInfo regionInfo, final TableDescriptor htd,
final RegionServerServices rsServices) {
this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo), wal, confParam, htd,
rsServices);
}
/**
* HRegion constructor. This constructor should only be used for testing and extensions. Instances
* of HRegion should be instantiated with the {@link HRegion#createHRegion} or
* {@link HRegion#openHRegion} method.
* @param fs is the filesystem.
* @param wal The WAL is the outbound log for any updates to the HRegion. The wal file is a
* logfile from the previous execution that's custom-computed for this HRegion.
* The HRegionServer computes and sorts the appropriate wal info for this
* HRegion. If there is a previous wal file (implying that the HRegion has been
* written-to before), then read it from the supplied path.
* @param confParam is global configuration settings.
* @param htd the table descriptor
* @param rsServices reference to {@link RegionServerServices} or null
*/
public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
final TableDescriptor htd, final RegionServerServices rsServices) {
if (htd == null) {
throw new IllegalArgumentException("Need table descriptor");
}
if (confParam instanceof CompoundConfiguration) {
throw new IllegalArgumentException("Need original base configuration");
}
this.wal = wal;
this.fs = fs;
this.mvcc = new MultiVersionConcurrencyControl(getRegionInfo().getShortNameToLog());
// 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
this.baseConf = confParam;
this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues());
this.cellComparator = htd.isMetaTable()
|| conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR)
? MetaCellComparator.META_COMPARATOR
: CellComparatorImpl.COMPARATOR;
this.lock = new ReentrantReadWriteLock(
conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK, DEFAULT_FAIR_REENTRANT_CLOSE_LOCK));
this.regionLockHolders = new ConcurrentHashMap<>();
this.flushCheckInterval =
conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL);
this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
throw new IllegalArgumentException(
MEMSTORE_FLUSH_PER_CHANGES + " can not exceed " + MAX_FLUSH_PER_CHANGES);
}
int tmpRowLockDuration =
conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION);
if (tmpRowLockDuration <= 0) {
LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row "
+ "locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration);
tmpRowLockDuration = 1;
}
this.rowLockWaitDuration = tmpRowLockDuration;
this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
this.htableDescriptor = htd;
Set<byte[]> families = this.htableDescriptor.getColumnFamilyNames();
for (byte[] family : families) {
if (!replicationScope.containsKey(family)) {
int scope = htd.getColumnFamily(family).getScope();
// Only store those families that have NON-DEFAULT scope
if (scope != REPLICATION_SCOPE_LOCAL) {
// Do a copy before storing it here.
replicationScope.put(Bytes.copy(family), scope);
}
}
}
this.rsServices = rsServices;
if (rsServices != null) {
this.blockCache = rsServices.getBlockCache().orElse(null);
this.mobFileCache = rsServices.getMobFileCache().orElse(null);
}
this.regionServicesForStores = new RegionServicesForStores(this, rsServices);
setHTableSpecificConf();
this.scannerReadPoints = new ConcurrentHashMap<>();
this.smallestReadPointCalcLock = new ReadPointCalculationLock(conf);
this.busyWaitDuration = conf.getLong("hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
throw new IllegalArgumentException("Invalid hbase.busy.wait.duration (" + busyWaitDuration
+ ") or hbase.busy.wait.multiplier.max (" + maxBusyWaitMultiplier
+ "). Their product should be positive");
}
this.maxBusyWaitDuration =
conf.getLong("hbase.ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
/*
* timestamp.slop provides a server-side constraint on the timestamp. This assumes that you base
* your TS around EnvironmentEdgeManager.currentTime(). In this case, throw an error to the user
* if the user-specified TS is newer than now + slop. LATEST_TIMESTAMP == don't use this
* functionality
*/
this.timestampSlop =
conf.getLong("hbase.hregion.keyvalue.timestamp.slop.millisecs", HConstants.LATEST_TIMESTAMP);
/**
* Timeout for the process time in processRowsWithLocks(). Use -1 to switch off time bound.
*/
this.rowProcessorTimeout =
conf.getLong("hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
this.storeHotnessProtector = new StoreHotnessProtector(this, conf);
boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC);
/**
* This is the global default value for durability. All tables/mutations not defining a
* durability or using USE_DEFAULT will default to this value.
*/
Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL;
this.regionDurability = this.htableDescriptor.getDurability() == Durability.USE_DEFAULT
? defaultDurability
: this.htableDescriptor.getDurability();
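// A hedged client-side example of opting a single table out of this default (illustrative only):
//   TableDescriptorBuilder.newBuilder(tableName)
//     .setDurability(Durability.ASYNC_WAL) // overrides the hbase.wal.hsync-derived default
//     .build();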
decorateRegionConfiguration(conf);
if (rsServices != null) {
this.rsAccounting = this.rsServices.getRegionServerAccounting();
// don't initialize coprocessors if not running within a regionserver
// TODO: revisit if coprocessors should load in other cases
this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf);
} else {
this.metricsRegionWrapper = null;
this.metricsRegion = null;
}
if (LOG.isDebugEnabled()) {
// Write out region name, its encoded name and storeHotnessProtector as string.
LOG.debug("Instantiated " + this + "; " + storeHotnessProtector.toString());
}
configurationManager = null;
// disable stats tracking for system tables, but check the config for everything else
this.regionStatsEnabled = htd.getTableName().getNamespaceAsString()
.equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR)
? false
: conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE);
this.miniBatchSize =
conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE);
// recover the metrics of read and write requests count if they were retained
if (rsServices != null && rsServices.getRegionServerAccounting() != null) {
Pair<Long, Long> retainedRWRequestsCnt = rsServices.getRegionServerAccounting()
.getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName());
if (retainedRWRequestsCnt != null) {
this.addReadRequestsCount(retainedRWRequestsCnt.getFirst());
this.addWriteRequestsCount(retainedRWRequestsCnt.getSecond());
// remove them since they won't be used again
rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt()
.remove(getRegionInfo().getEncodedName());
}
}
}
private void setHTableSpecificConf() {
if (this.htableDescriptor == null) {
return;
}
long flushSize = this.htableDescriptor.getMemStoreFlushSize();
if (flushSize <= 0) {
flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE);
}
this.memstoreFlushSize = flushSize;
long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
this.blockingMemStoreSize = this.memstoreFlushSize * mult;
}
/**
* Initialize this region. Used only by tests and SplitTransaction to reopen the region. You
* should use createHRegion() or openHRegion()
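* <p>
* A hedged sketch of the preferred path (one of several {@code openHRegion} overloads; the
* argument names are assumptions):
* <pre>{@code
* HRegion region = HRegion.openHRegion(conf, fs, rootDir, regionInfo, tableDescriptor, wal);
* try {
*   // use the region
* } finally {
*   region.close();
* }
* }</pre>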
* @return What the next sequence (edit) id should be.
* @throws IOException e
* @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
*/
@Deprecated
public long initialize() throws IOException {
return initialize(null);
}
/**
* Initialize this region.
* @param reporter Tickle every so often if initialize is taking a while.
* @return What the next sequence (edit) id should be.
*/
long initialize(final CancelableProgressable reporter) throws IOException {
// Refuse to open the region if there is no column family in the table
if (htableDescriptor.getColumnFamilyCount() == 0) {
throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString()
+ " should have at least one column family.");
}
MonitoredTask status =
TaskMonitor.get().createStatus("Initializing region " + this, false, true);
long nextSeqId = -1;
try {
nextSeqId = initializeRegionInternals(reporter, status);
return nextSeqId;
} catch (IOException e) {
LOG.warn("Failed initialize of region= {}, starting to roll back memstore",
getRegionInfo().getRegionNameAsString(), e);
// global memstore size will be decreased when dropping memstore
try {
// drop the memory used by memstore if open region fails
dropMemStoreContents();
} catch (IOException ioE) {
if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) {
LOG.warn(
"Failed drop memstore of region= {}, "
+ "some chunks may not released forever since MSLAB is enabled",
getRegionInfo().getRegionNameAsString());
}
}
throw e;
} finally {
// nextSeqid will be -1 if the initialization fails.
// At least it will be 0 otherwise.
if (nextSeqId == -1) {
status.abort("Exception during region " + getRegionInfo().getRegionNameAsString()
+ " initialization.");
}
if (LOG.isDebugEnabled()) {
LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
}
status.cleanup();
}
}
private long initializeRegionInternals(final CancelableProgressable reporter,
final MonitoredTask status) throws IOException {
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-open hook");
coprocessorHost.preOpen();
}
String policyName = this.conf.get(REGION_STORAGE_POLICY_KEY, DEFAULT_REGION_STORAGE_POLICY);
this.fs.setStoragePolicy(policyName.trim());
// Write HRI to a file in case we need to recover hbase:meta
// Only the primary replica should write .regioninfo
if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) {
status.setStatus("Writing region info on filesystem");
fs.checkRegionInfoOnFilesystem();
}
// Initialize all the HStores
status.setStatus("Initializing all the Stores");
long maxSeqId = initializeStores(reporter, status);
this.mvcc.advanceTo(maxSeqId);
if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
Collection<HStore> stores = this.stores.values();
try {
// update the stores that we are replaying
stores.forEach(HStore::startReplayingFromWAL);
// Recover any edits if available.
maxSeqId =
Math.max(maxSeqId, replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status));
// Recover any hfiles if available
maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores));
// Make sure mvcc is up to max.
this.mvcc.advanceTo(maxSeqId);
} finally {
// update the stores that we are done replaying
stores.forEach(HStore::stopReplayingFromWAL);
}
}
this.lastReplayedOpenRegionSeqId = maxSeqId;
this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
this.writestate.flushRequested = false;
this.writestate.compacting.set(0);
if (this.writestate.writesEnabled) {
// Remove temporary data left over from old regions
status.setStatus("Cleaning up temporary data from old regions");
fs.cleanupTempDir();
}
// Initialize split policy
this.splitPolicy = RegionSplitPolicy.create(this, conf);
// Initialize split restriction
splitRestriction = RegionSplitRestriction.create(getTableDescriptor(), conf);
// Initialize flush policy
this.flushPolicy = FlushPolicyFactory.create(this, conf);
long lastFlushTime = EnvironmentEdgeManager.currentTime();
for (HStore store : stores.values()) {
this.lastStoreFlushTimeMap.put(store, lastFlushTime);
}
// Use maximum of log sequenceid or that which was found in stores
// (particularly if no recovered edits, seqid will be -1).
long nextSeqId = maxSeqId + 1;
if (!isRestoredRegion) {
// always get openSeqNum from the default replica, even if we are secondary replicas
long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf,
RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem,
this::getWalFileSystem);
nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1;
// The openSeqNum will always increase even for a read only region, as we rely on it to
// determine whether a region has been successfully reopened, so here we always need to update
// the max sequence id file.
if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName());
WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(),
nextSeqId - 1);
// This means we have replayed all the recovered edits and also written out the max sequence
// id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617
// for more details.
Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf,
getRegionInfo().getTable(), getRegionInfo().getEncodedName());
FileSystem walFs = getWalFileSystem();
if (walFs.exists(wrongRegionWALDir)) {
if (!walFs.delete(wrongRegionWALDir, true)) {
LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir);
}
}
}
}
LOG.info("Opened {}; next sequenceid={}; {}, {}", this.getRegionInfo().getShortNameToLog(),
nextSeqId, this.splitPolicy, this.flushPolicy);
// A region can be reopened if failed a split; reset flags
this.closing.set(false);
this.closed.set(false);
if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-open hooks");
coprocessorHost.postOpen();
}
status.markComplete("Region opened successfully");
return nextSeqId;
}
/**
* Open all Stores.
* @return Highest sequenceId found out in a Store.
*/
private long initializeStores(CancelableProgressable reporter, MonitoredTask status)
throws IOException {
return initializeStores(reporter, status, false);
}
private long initializeStores(CancelableProgressable reporter, MonitoredTask status,
boolean warmup) throws IOException {
// Load in all the HStores.
long maxSeqId = -1;
// initialized to -1 so that we pick up MemstoreTS from column families
long maxMemstoreTS = -1;
if (htableDescriptor.getColumnFamilyCount() != 0) {
// initialize the thread pool for opening stores in parallel.
ThreadPoolExecutor storeOpenerThreadPool =
getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
CompletionService<HStore> completionService =
new ExecutorCompletionService<>(storeOpenerThreadPool);
// initialize each store in parallel
for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) {
status.setStatus("Instantiating store for column family " + family);
completionService.submit(new Callable<HStore>() {
@Override
public HStore call() throws IOException {
return instantiateHStore(family, warmup);
}
});
}
boolean allStoresOpened = false;
boolean hasSloppyStores = false;
try {
for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) {
Future<HStore> future = completionService.take();
HStore store = future.get();
this.stores.put(store.getColumnFamilyDescriptor().getName(), store);
if (store.isSloppyMemStore()) {
hasSloppyStores = true;
}
long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L);
maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()), storeMaxSequenceId);
if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
maxSeqId = storeMaxSequenceId;
}
long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L);
if (maxStoreMemstoreTS > maxMemstoreTS) {
maxMemstoreTS = maxStoreMemstoreTS;
}
}
allStoresOpened = true;
if (hasSloppyStores) {
htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor)
.setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName()).build();
LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this);
}
} catch (InterruptedException e) {
throw throwOnInterrupt(e);
} catch (ExecutionException e) {
throw new IOException(e.getCause());
} finally {
storeOpenerThreadPool.shutdownNow();
if (!allStoresOpened) {
// something went wrong, close all opened stores
LOG.error("Could not initialize all stores for the region=" + this);
for (HStore store : this.stores.values()) {
try {
store.close();
} catch (IOException e) {
LOG.warn("close store {} failed in region {}", store.toString(), this, e);
}
}
}
}
}
return Math.max(maxSeqId, maxMemstoreTS + 1);
}
private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
// Initialize all the HStores
status.setStatus("Warmup all stores of " + this.getRegionInfo().getRegionNameAsString());
try {
initializeStores(reporter, status, true);
} finally {
status.markComplete("Warmed up " + this.getRegionInfo().getRegionNameAsString());
}
}
/** Returns Map of StoreFiles by column family */
private NavigableMap<byte[], List<Path>> getStoreFiles() {
NavigableMap<byte[], List<Path>> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
for (HStore store : stores.values()) {
Collection<HStoreFile> storeFiles = store.getStorefiles();
if (storeFiles == null) {
continue;
}
List<Path> storeFileNames = new ArrayList<>();
for (HStoreFile storeFile : storeFiles) {
storeFileNames.add(storeFile.getPath());
}
allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames);
}
return allStoreFiles;
}
protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
Map<byte[], List<Path>> storeFiles = getStoreFiles();
RegionEventDescriptor regionOpenDesc =
ProtobufUtil.toRegionEventDescriptor(RegionEventDescriptor.EventType.REGION_OPEN,
getRegionInfo(), openSeqId, getRegionServerServices().getServerName(), storeFiles);
WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc,
mvcc);
}
private void writeRegionCloseMarker(WAL wal) throws IOException {
Map<byte[], List<Path>> storeFiles = getStoreFiles();
RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(),
getRegionServerServices().getServerName(), storeFiles);
WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc,
mvcc);
// Store SeqId in WAL FileSystem when a region closes
// checking region folder exists is due to many tests which delete the table folder while a
// table is still online
if (getWalFileSystem().exists(getWALRegionDir())) {
WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(),
mvcc.getReadPoint());
}
}
/** Returns True if this region has references. */
public boolean hasReferences() {
return stores.values().stream().anyMatch(HStore::hasReferences);
}
public void blockUpdates() {
this.updatesLock.writeLock().lock();
}
public void unblockUpdates() {
this.updatesLock.writeLock().unlock();
}
public HDFSBlocksDistribution getHDFSBlocksDistribution() {
HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
stores.values().stream().filter(s -> s.getStorefiles() != null)
.flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution)
.forEachOrdered(hdfsBlocksDistribution::add);
return hdfsBlocksDistribution;
}
/**
* This is a helper function to compute HDFS block distribution on demand
* @param conf configuration
* @param tableDescriptor TableDescriptor of the table
* @param regionInfo the region for which to compute the block distribution
* @return The HDFS blocks distribution for the given region.
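* <p>
* A hedged usage sketch (the descriptor, region info, and hostname are assumed to be at hand):
* <pre>{@code
* HDFSBlocksDistribution dist =
*   HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
* float locality = dist.getBlockLocalityIndex(serverHostname);
* }</pre>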
*/
public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf,
TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException {
Path tablePath =
CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName());
return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
}
/**
* This is a helper function to compute HDFS block distribution on demand
* @param conf configuration
* @param tableDescriptor TableDescriptor of the table
* @param regionInfo the region for which to compute the block distribution
* @param tablePath the table directory
* @return The HDFS blocks distribution for the given region.
*/
public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf,
TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException {
HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
FileSystem fs = tablePath.getFileSystem(conf);
HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) {
List<LocatedFileStatus> locatedFileStatusList =
HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true);
if (locatedFileStatusList == null) {
continue;
}
for (LocatedFileStatus status : locatedFileStatusList) {
Path p = status.getPath();
if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) {
// Only construct a StoreFileInfo object when it's not a plain hfile, to save object
// creation
StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status);
hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
} else if (StoreFileInfo.isHFile(p)) {
// If it's an HFile, then let's just add it to the block distribution;
// let's not create more objects here, not even another HDFSBlocksDistribution
FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations());
} else {
throw new IOException("path=" + p + " doesn't look like a valid StoreFile");
}
}
}
return hdfsBlocksDistribution;
}
/**
* Increase the size of mem store in this region and the size of global mem store
*/
private void incMemStoreSize(MemStoreSize mss) {
incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(),
mss.getCellsCount());
}
void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta,
int cellsCountDelta) {
if (this.rsAccounting != null) {
rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta);
}
long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta,
offHeapSizeDelta, cellsCountDelta);
checkNegativeMemStoreDataSize(dataSize, dataSizeDelta);
}
void decrMemStoreSize(MemStoreSize mss) {
decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(),
mss.getCellsCount());
}
private void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta,
int cellsCountDelta) {
if (this.rsAccounting != null) {
rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta);
}
long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta,
offHeapSizeDelta, cellsCountDelta);
checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta);
}
private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) {
// This is extremely bad if we make memStoreSizing negative. Log as much info on the offending
// caller as possible. (memStoreSizing might be a negative value already -- freeing memory)
if (memStoreDataSize < 0) {
LOG.error("Asked to modify this region's (" + this.toString()
+ ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing="
+ (memStoreDataSize - delta) + ", delta=" + delta, new Exception());
}
}
@Override
public RegionInfo getRegionInfo() {
return this.fs.getRegionInfo();
}
/** Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null. */
RegionServerServices getRegionServerServices() {
return this.rsServices;
}
@Override
public long getReadRequestsCount() {
return readRequestsCount.sum();
}
@Override
public long getFilteredReadRequestsCount() {
return filteredReadRequestsCount.sum();
}
@Override
public long getWriteRequestsCount() {
return writeRequestsCount.sum();
}
@Override
public long getMemStoreDataSize() {
return memStoreSizing.getDataSize();
}
@Override
public long getMemStoreHeapSize() {
return memStoreSizing.getHeapSize();
}
@Override
public long getMemStoreOffHeapSize() {
return memStoreSizing.getOffHeapSize();
}
/** Returns store services for this region, to access services required by store level needs */
public RegionServicesForStores getRegionServicesForStores() {
return regionServicesForStores;
}
@Override
public long getNumMutationsWithoutWAL() {
return numMutationsWithoutWAL.sum();
}
@Override
public long getDataInMemoryWithoutWAL() {
return dataInMemoryWithoutWAL.sum();
}
@Override
public long getBlockedRequestsCount() {
return blockedRequestsCount.sum();
}
@Override
public long getCheckAndMutateChecksPassed() {
return checkAndMutateChecksPassed.sum();
}
@Override
public long getCheckAndMutateChecksFailed() {
return checkAndMutateChecksFailed.sum();
}
// TODO Needs a check on whether we should expose our metrics system to CPs. If CPs themselves are
// doing the op and bypassing the core, this might be needed? Should we stop supporting the bypass
// feature?
public MetricsRegion getMetrics() {
return metricsRegion;
}
@Override
public boolean isClosed() {
return this.closed.get();
}
@Override
public boolean isClosing() {
return this.closing.get();
}
@Override
public boolean isReadOnly() {
return this.writestate.isReadOnly();
}
@Override
public boolean isAvailable() {
return !isClosed() && !isClosing();
}
@Override
public boolean isSplittable() {
return splitPolicy.canSplit();
}
@Override
public boolean isMergeable() {
if (!isAvailable()) {
LOG.debug("Region " + this + " is not mergeable because it is closing or closed");
return false;
}
if (hasReferences()) {
LOG.debug("Region " + this + " is not mergeable because it has references");
return false;
}
return true;
}
public boolean areWritesEnabled() {
synchronized (this.writestate) {
return this.writestate.writesEnabled;
}
}
public MultiVersionConcurrencyControl getMVCC() {
return mvcc;
}
@Override
public long getMaxFlushedSeqId() {
return maxFlushedSeqId;
}
/** Returns readpoint considering given IsolationLevel. Pass {@code null} for default */
public long getReadPoint(IsolationLevel isolationLevel) {
if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
// This scan can read even uncommitted transactions
return Long.MAX_VALUE;
}
return mvcc.getReadPoint();
}
public boolean isLoadingCfsOnDemandDefault() {
return this.isLoadingCfsOnDemandDefault;
}
/**
* Close down this HRegion. Flush the cache, shut down each HStore, don't service any more calls.
*
* This method could take some time to execute, so don't call it from a time-sensitive thread.
* @return Vector of all the storage files that the HRegion's component HStores make use of. It's
* a list of all StoreFile objects. Returns empty vector if already closed and null if
* judged that it should not close.
* @throws IOException e
* @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was
* not properly persisted. The region is put in closing mode, and
* the caller MUST abort after this.
*/
public Map<byte[], List<HStoreFile>> close() throws IOException {
return close(false);
}
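// Illustrative usage sketch (an assumption, not part of the original source): a caller that owns
// this HRegion, e.g. an unassign or shutdown path, might close it and inspect the returned store
// files roughly like this:
//
//   Map<byte[], List<HStoreFile>> closedFiles = region.close();
//   if (closedFiles == null) {
//     // region was already closed, or it judged that it should not close
//   }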
private final Object closeLock = new Object();
/** Conf key for fair locking policy */
public static final String FAIR_REENTRANT_CLOSE_LOCK =
"hbase.regionserver.fair.region.close.lock";
public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true;
/** Conf key for the periodic flush interval */
public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
"hbase.regionserver.optionalcacheflushinterval";
/** Default interval for the memstore flush */
public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
/** Default interval for System tables memstore flush */
public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
/** Conf key to force a flush if there are already enough changes for one region in memstore */
public static final String MEMSTORE_FLUSH_PER_CHANGES = "hbase.regionserver.flush.per.changes";
public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
/**
* The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
* overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region
*/
public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
public static final String CLOSE_WAIT_ABORT = "hbase.regionserver.close.wait.abort";
public static final boolean DEFAULT_CLOSE_WAIT_ABORT = false;
public static final String CLOSE_WAIT_TIME = "hbase.regionserver.close.wait.time.ms";
public static final long DEFAULT_CLOSE_WAIT_TIME = 60000; // 1 minute
public static final String CLOSE_WAIT_INTERVAL = "hbase.regionserver.close.wait.interval.ms";
public static final long DEFAULT_CLOSE_WAIT_INTERVAL = 10000; // 10 seconds
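// Illustrative configuration sketch (an assumption, not part of the original source): a deployment
// that wants doClose() to interrupt in-flight operations and abort the regionserver if the close
// lock cannot be acquired within 30 seconds might set the keys defined above like this:
//
//   Configuration conf = HBaseConfiguration.create();
//   conf.setBoolean(CLOSE_WAIT_ABORT, true);   // hbase.regionserver.close.wait.abort
//   conf.setLong(CLOSE_WAIT_TIME, 30000);      // hbase.regionserver.close.wait.time.ms
//   conf.setLong(CLOSE_WAIT_INTERVAL, 5000);   // hbase.regionserver.close.wait.interval.ms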
/**
* Close down this HRegion. Flush the cache unless the abort parameter is true, shut down each
* HStore, and don't service any more calls. This method could take some time to execute, so don't
* call it from a time-sensitive thread.
* from a time-sensitive thread.
* @param abort true if server is aborting (only during testing)
* @return Vector of all the storage files that the HRegion's component HStores make use of. It's
* a list of StoreFile objects. Can be null if we are not to close at this time or we are
* already closed.
* @throws IOException e
* @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was
* not properly persisted. The region is put in closing mode, and
* the caller MUST abort after this.
*/
public Map<byte[], List<HStoreFile>> close(boolean abort) throws IOException {
// Only allow one thread to close at a time. Serialize them so dual
// threads attempting to close will run up against each other.
MonitoredTask status = TaskMonitor.get().createStatus(
"Closing region " + this.getRegionInfo().getEncodedName() + (abort ? " due to abort" : ""),
false, true);
status.setStatus("Waiting for close lock");
try {
synchronized (closeLock) {
return doClose(abort, status);
}
} finally {
if (LOG.isDebugEnabled()) {
LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
}
status.cleanup();
}
}
/**
* Exposed for some very specific unit tests.
*/
public void setClosing(boolean closing) {
this.closing.set(closing);
}
/**
* By default {@link HRegion#doClose} will block forever on the write lock, which makes it hard to
* reproduce a deadlock via a unit test. If you set this timeout, {@link HRegion#doClose} will
* throw an exception instead of blocking.
* @param timeoutForWriteLock time in seconds to wait for the write lock in
* {@link HRegion#doClose}
*/
public void setTimeoutForWriteLock(long timeoutForWriteLock) {
assert timeoutForWriteLock >= 0;
this.timeoutForWriteLock = timeoutForWriteLock;
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK_EXCEPTION_PATH",
justification = "I think FindBugs is confused")
private Map<byte[], List<HStoreFile>> doClose(boolean abort, MonitoredTask status)
throws IOException {
if (isClosed()) {
LOG.warn("Region " + this + " already closed");
return null;
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-close hooks");
this.coprocessorHost.preClose(abort);
}
status.setStatus("Disabling compacts and flushes for region");
boolean canFlush = true;
synchronized (writestate) {
// Disable compacting and flushing by background threads for this
// region.
canFlush = !writestate.readOnly;
writestate.writesEnabled = false;
LOG.debug("Closing {}, disabling compactions & flushes",
this.getRegionInfo().getEncodedName());
waitForFlushesAndCompactions();
}
// If we were not just flushing, is it worth doing a preflush...one
// that will clear out the bulk of the memstore before we put up
// the close flag?
if (!abort && worthPreFlushing() && canFlush) {
status.setStatus("Pre-flushing region before close");
LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName());
try {
internalFlushcache(status);
} catch (IOException ioe) {
// Failed to flush the region. Keep going.
status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
}
}
// Set the closing flag
// From this point new arrivals at the region lock will get NSRE.
this.closing.set(true);
LOG.info("Closing region {}", this);
// Acquire the close lock
// The configuration parameter CLOSE_WAIT_ABORT is overloaded to enable both
// the new regionserver abort condition and interrupts for running requests.
// If CLOSE_WAIT_ABORT is not enabled there is no change from earlier behavior,
// we will not attempt to interrupt threads servicing requests nor crash out
// the regionserver if something remains stubborn.
final boolean canAbort = conf.getBoolean(CLOSE_WAIT_ABORT, DEFAULT_CLOSE_WAIT_ABORT);
boolean useTimedWait = false;
if (timeoutForWriteLock != null && timeoutForWriteLock != Long.MAX_VALUE) {
// convert legacy use of timeoutForWriteLock in seconds to new use in millis
timeoutForWriteLock = TimeUnit.SECONDS.toMillis(timeoutForWriteLock);
useTimedWait = true;
} else if (canAbort) {
timeoutForWriteLock = conf.getLong(CLOSE_WAIT_TIME, DEFAULT_CLOSE_WAIT_TIME);
useTimedWait = true;
}
if (LOG.isDebugEnabled()) {
LOG.debug((useTimedWait ? "Time limited wait" : "Waiting without time limit")
+ " for close lock on " + this);
}
final long closeWaitInterval = conf.getLong(CLOSE_WAIT_INTERVAL, DEFAULT_CLOSE_WAIT_INTERVAL);
long elapsedWaitTime = 0;
if (useTimedWait) {
// Sanity check configuration
long remainingWaitTime = timeoutForWriteLock;
if (remainingWaitTime < closeWaitInterval) {
LOG.warn("Time limit for close wait of " + timeoutForWriteLock
+ " ms is less than the configured lock acquisition wait interval " + closeWaitInterval
+ " ms, using wait interval as time limit");
remainingWaitTime = closeWaitInterval;
}
boolean acquired = false;
do {
long start = EnvironmentEdgeManager.currentTime();
try {
acquired = lock.writeLock().tryLock(Math.min(remainingWaitTime, closeWaitInterval),
TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
// Interrupted waiting for close lock. More likely the server is shutting down, not
// normal operation, so aborting upon interrupt while waiting on this lock would not
// provide much value. Throw an IOE (as IIOE) like we would in the case where we
// fail to acquire the lock.
String msg = "Interrupted while waiting for close lock on " + this;
LOG.warn(msg, e);
throw (InterruptedIOException) new InterruptedIOException(msg).initCause(e);
}
long elapsed = EnvironmentEdgeManager.currentTime() - start;
elapsedWaitTime += elapsed;
remainingWaitTime -= elapsed;
if (canAbort && !acquired && remainingWaitTime > 0) {
// Before we loop to wait again, interrupt all region operations that might
// still be in progress, to encourage them to break out of waiting states or
// inner loops, throw an exception to clients, and release the read lock via
// endRegionOperation.
if (LOG.isDebugEnabled()) {
LOG.debug("Interrupting region operations after waiting for close lock for "
+ elapsedWaitTime + " ms on " + this + ", " + remainingWaitTime + " ms remaining");
}
interruptRegionOperations();
}
} while (!acquired && remainingWaitTime > 0);
// If we fail to acquire the lock, trigger an abort if we can; otherwise throw an IOE
// to let the caller know we could not proceed with the close.
if (!acquired) {
String msg =
"Failed to acquire close lock on " + this + " after waiting " + elapsedWaitTime + " ms";
LOG.error(msg);
if (canAbort) {
// If we failed to acquire the write lock, abort the server
rsServices.abort(msg, null);
}
throw new IOException(msg);
}
} else {
long start = EnvironmentEdgeManager.currentTime();
lock.writeLock().lock();
elapsedWaitTime = EnvironmentEdgeManager.currentTime() - start;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Acquired close lock on " + this + " after waiting " + elapsedWaitTime + " ms");
}
status.setStatus("Disabling writes for close");
try {
if (this.isClosed()) {
status.abort("Already got closed by another process");
// SplitTransaction handles the null
return null;
}
LOG.debug("Updates disabled for region " + this);
// Don't flush the cache if we are aborting
if (!abort && canFlush) {
int failedFlushCount = 0;
int flushCount = 0;
long tmp = 0;
long remainingSize = this.memStoreSizing.getDataSize();
while (remainingSize > 0) {
try {
internalFlushcache(status);
if (flushCount > 0) {
LOG.info("Running extra flush, " + flushCount + " (carrying snapshot?) " + this);
}
flushCount++;
tmp = this.memStoreSizing.getDataSize();
if (tmp >= remainingSize) {
failedFlushCount++;
}
remainingSize = tmp;
if (failedFlushCount > 5) {
// If we failed 5 times and are unable to clear memory, abort
// so we do not lose data
throw new DroppedSnapshotException("Failed clearing memory after " + flushCount
+ " attempts on region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()));
}
} catch (IOException ioe) {
status.setStatus("Failed flush " + this + ", putting online again");
synchronized (writestate) {
writestate.writesEnabled = true;
}
// Have to throw to upper layers. I can't abort server from here.
throw ioe;
}
}
}
Map<byte[], List<HStoreFile>> result = new TreeMap<>(Bytes.BYTES_COMPARATOR);
if (!stores.isEmpty()) {
// initialize the thread pool for closing stores in parallel.
ThreadPoolExecutor storeCloserThreadPool =
getStoreOpenAndCloseThreadPool("StoreCloser-" + getRegionInfo().getRegionNameAsString());
CompletionService<Pair<byte[], Collection<HStoreFile>>> completionService =
new ExecutorCompletionService<>(storeCloserThreadPool);
// close each store in parallel
for (HStore store : stores.values()) {
MemStoreSize mss = store.getFlushableSize();
if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) {
if (getRegionServerServices() != null) {
getRegionServerServices().abort("Assertion failed while closing store "
+ getRegionInfo().getRegionNameAsString() + " " + store
+ ". flushableSize expected=0, actual={" + mss + "}. Current memStoreSize="
+ this.memStoreSizing.getMemStoreSize() + ". Maybe a coprocessor "
+ "operation failed and left the memstore in a partially updated state.", null);
}
}
completionService.submit(new Callable<Pair<byte[], Collection<HStoreFile>>>() {
@Override
public Pair<byte[], Collection<HStoreFile>> call() throws IOException {
return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close());
}
});
}
try {
for (int i = 0; i < stores.size(); i++) {
Future<Pair<byte[], Collection<HStoreFile>>> future = completionService.take();
Pair<byte[], Collection<HStoreFile>> storeFiles = future.get();
List<HStoreFile> familyFiles = result.get(storeFiles.getFirst());
if (familyFiles == null) {
familyFiles = new ArrayList<>();
result.put(storeFiles.getFirst(), familyFiles);
}
familyFiles.addAll(storeFiles.getSecond());
}
} catch (InterruptedException e) {
throw throwOnInterrupt(e);
} catch (ExecutionException e) {
Throwable cause = e.getCause();
if (cause instanceof IOException) {
throw (IOException) cause;
}
throw new IOException(cause);
} finally {
storeCloserThreadPool.shutdownNow();
}
}
status.setStatus("Writing region close event to WAL");
// Always write close marker to wal even for read only table. This is not a big problem as we
// do not write any data into the region; it is just a meta edit in the WAL file.
if (
!abort && wal != null && getRegionServerServices() != null
&& RegionReplicaUtil.isDefaultReplica(getRegionInfo())
) {
writeRegionCloseMarker(wal);
}
this.closed.set(true);
if (!canFlush) {
decrMemStoreSize(this.memStoreSizing.getMemStoreSize());
} else if (this.memStoreSizing.getDataSize() != 0) {
LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this);
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-close hooks");
this.coprocessorHost.postClose(abort);
}
if (this.metricsRegion != null) {
this.metricsRegion.close();
}
if (this.metricsRegionWrapper != null) {
Closeables.close(this.metricsRegionWrapper, true);
}
status.markComplete("Closed");
LOG.info("Closed {}", this);
return result;
} finally {
lock.writeLock().unlock();
}
}
/** Wait for all current flushes and compactions of the region to complete */
// TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for
// Phoenix needs.
public void waitForFlushesAndCompactions() {
synchronized (writestate) {
if (this.writestate.readOnly) {
// we should not wait for replayed flushes if we are read only (for example in case the
// region is a secondary replica).
return;
}
boolean interrupted = false;
try {
while (writestate.compacting.get() > 0 || writestate.flushing) {
LOG.debug("waiting for " + writestate.compacting + " compactions"
+ (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
try {
writestate.wait();
} catch (InterruptedException iex) {
// essentially ignore and propagate the interrupt back up
LOG.warn("Interrupted while waiting in region {}", this);
interrupted = true;
break;
}
}
} finally {
if (interrupted) {
Thread.currentThread().interrupt();
}
}
}
}
/**
* Wait for all current flushes of the region to complete
*/
public void waitForFlushes() {
waitForFlushes(0);// Unbound wait
}
@Override
public boolean waitForFlushes(long timeout) {
synchronized (writestate) {
if (this.writestate.readOnly) {
// we should not wait for replayed flushes if we are read only (for example in case the
// region is a secondary replica).
return true;
}
if (!writestate.flushing) return true;
long start = EnvironmentEdgeManager.currentTime();
long duration = 0;
boolean interrupted = false;
LOG.debug("waiting for cache flush to complete for region " + this);
try {
while (writestate.flushing) {
if (timeout > 0 && duration >= timeout) break;
try {
long toWait = timeout == 0 ? 0 : (timeout - duration);
writestate.wait(toWait);
} catch (InterruptedException iex) {
// essentially ignore and propagate the interrupt back up
LOG.warn("Interrupted while waiting in region {}", this);
interrupted = true;
break;
} finally {
duration = EnvironmentEdgeManager.currentTime() - start;
}
}
} finally {
if (interrupted) {
Thread.currentThread().interrupt();
}
}
LOG.debug("Waited {} ms for region {} flush to complete", duration, this);
return !(writestate.flushing);
}
}
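// Illustrative usage sketch (an assumption, not part of the original source): a caller that wants
// the memstore quiesced before proceeding, e.g. before taking a snapshot, might wait with a
// bounded timeout and handle the case where a flush is still running:
//
//   boolean quiesced = region.waitForFlushes(TimeUnit.SECONDS.toMillis(30));
//   if (!quiesced) {
//     // a flush was still in progress after 30 seconds; retry or give up
//   }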
@Override
public Configuration getReadOnlyConfiguration() {
return new ReadOnlyConfiguration(this.conf);
}
private ThreadPoolExecutor getStoreOpenAndCloseThreadPool(final String threadNamePrefix) {
int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount());
int maxThreads = Math.min(numStores, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
}
ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(final String threadNamePrefix) {
int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount());
int maxThreads = Math.max(1, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX) / numStores);
return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
}
private static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
final String threadNamePrefix) {
return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
new ThreadFactory() {
private int count = 1;
@Override
public Thread newThread(Runnable r) {
return new Thread(r, threadNamePrefix + "-" + count++);
}
});
}
/** Returns True if its worth doing a flush before we put up the close flag. */
private boolean worthPreFlushing() {
return this.memStoreSizing.getDataSize()
> this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
}
//////////////////////////////////////////////////////////////////////////////
// HRegion accessors
//////////////////////////////////////////////////////////////////////////////
@Override
public TableDescriptor getTableDescriptor() {
return this.htableDescriptor;
}
@RestrictedApi(explanation = "Should only be called in tests", link = "",
allowedOnPath = ".*/src/test/.*")
public void setTableDescriptor(TableDescriptor desc) {
htableDescriptor = desc;
}
/** Returns WAL in use for this region */
public WAL getWAL() {
return this.wal;
}
public BlockCache getBlockCache() {
return this.blockCache;
}
/**
* Only used for unit test which doesn't start region server.
*/
public void setBlockCache(BlockCache blockCache) {
this.blockCache = blockCache;
}
public MobFileCache getMobFileCache() {
return this.mobFileCache;
}
/**
* Only used for unit test which doesn't start region server.
*/
public void setMobFileCache(MobFileCache mobFileCache) {
this.mobFileCache = mobFileCache;
}
/** Returns split policy for this region. */
RegionSplitPolicy getSplitPolicy() {
return this.splitPolicy;
}
/**
* A split takes the config from the parent region & passes it to the daughter region's
* constructor. If 'conf' was passed, you would end up using the HTD of the parent region in
* addition to the new daughter HTD. Pass 'baseConf' to the daughter regions to avoid this tricky
* dedupe problem.
* @return Configuration object
*/
Configuration getBaseConf() {
return this.baseConf;
}
/** Returns {@link FileSystem} being used by this region */
public FileSystem getFilesystem() {
return fs.getFileSystem();
}
/** Returns the {@link HRegionFileSystem} used by this region */
public HRegionFileSystem getRegionFileSystem() {
return this.fs;
}
/** Returns the WAL {@link HRegionFileSystem} used by this region */
HRegionWALFileSystem getRegionWALFileSystem() throws IOException {
return new HRegionWALFileSystem(conf, getWalFileSystem(),
CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo());
}
/** Returns the WAL {@link FileSystem} being used by this region */
FileSystem getWalFileSystem() throws IOException {
if (walFS == null) {
walFS = CommonFSUtils.getWALFileSystem(conf);
}
return walFS;
}
/**
* @return the Region directory under WALRootDirectory
* @throws IOException if there is an error getting WALRootDir
*/
public Path getWALRegionDir() throws IOException {
if (regionWalDir == null) {
regionWalDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(),
getRegionInfo().getEncodedName());
}
return regionWalDir;
}
@Override
public long getEarliestFlushTimeForAllStores() {
return Collections.min(lastStoreFlushTimeMap.values());
}
@Override
public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
long result = Long.MAX_VALUE;
for (HStore store : stores.values()) {
Collection<HStoreFile> storeFiles = store.getStorefiles();
if (storeFiles == null) {
continue;
}
for (HStoreFile file : storeFiles) {
StoreFileReader sfReader = file.getReader();
if (sfReader == null) {
continue;
}
HFile.Reader reader = sfReader.getHFileReader();
if (reader == null) {
continue;
}
if (majorCompactionOnly) {
byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY);
if (val == null || !Bytes.toBoolean(val)) {
continue;
}
}
result = Math.min(result, reader.getFileContext().getFileCreateTime());
}
}
return result == Long.MAX_VALUE ? 0 : result;
}
RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
regionLoadBldr.clearStoreCompleteSequenceId();
for (byte[] familyName : this.stores.keySet()) {
long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName);
// Subtract - 1 to go earlier than the current oldest, unflushed edit in memstore; this will
// give us a sequence id that is for sure flushed. We want edit replay to start after this
// sequence id in this region. If NO_SEQNUM, use the regions maximum flush id.
long csid = (earliest == HConstants.NO_SEQNUM) ? lastFlushOpSeqIdLocal : earliest - 1;
regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder()
.setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build());
}
return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
}
//////////////////////////////////////////////////////////////////////////////
// HRegion maintenance.
//
// These methods are meant to be called periodically by the HRegionServer for
// upkeep.
//////////////////////////////////////////////////////////////////////////////
/**
* Do preparation for pending compaction.
*/
protected void doRegionCompactionPrep() throws IOException {
}
/**
* Synchronously compact all stores in the region.
*
* This operation could block for a long time, so don't call it from a time-sensitive thread.
*
* Note that no locks are taken to prevent possible conflicts between compaction and splitting
* activities. The regionserver does not normally compact and split in parallel. However by
* calling this method you may introduce unexpected and unhandled concurrency. Don't do this
* unless you know what you are doing.
* @param majorCompaction True to force a major compaction regardless of thresholds
*/
public void compact(boolean majorCompaction) throws IOException {
if (majorCompaction) {
stores.values().forEach(HStore::triggerMajorCompaction);
}
for (HStore s : stores.values()) {
Optional<CompactionContext> compaction = s.requestCompaction();
if (compaction.isPresent()) {
ThroughputController controller = null;
if (rsServices != null) {
controller = CompactionThroughputControllerFactory.create(rsServices, conf);
}
if (controller == null) {
controller = NoLimitThroughputController.INSTANCE;
}
compact(compaction.get(), s, controller, null);
}
}
}
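// Illustrative usage sketch (an assumption, not part of the original source): utilities or tests
// that hold an HRegion directly, outside the normal compaction chores, might force a synchronous
// major compaction of every store like this:
//
//   region.compact(true); // trigger and run a major compaction for each store, one after another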
/**
* This is a helper function that compacts all the stores synchronously.
*
* It is used by utilities and testing
*/
public void compactStores() throws IOException {
for (HStore s : stores.values()) {
Optional<CompactionContext> compaction = s.requestCompaction();
if (compaction.isPresent()) {
compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null);
}
}
}
/**
* This is a helper function that compacts the given store.
*
* It is used by utilities and testing
*/
void compactStore(byte[] family, ThroughputController throughputController) throws IOException {
HStore s = getStore(family);
Optional<CompactionContext> compaction = s.requestCompaction();
if (compaction.isPresent()) {
compact(compaction.get(), s, throughputController, null);
}
}
/**
* Called by compaction thread and after region is opened to compact the HStores if necessary.
*
* This operation could block for a long time, so don't call it from a time-sensitive thread. Note
* that no locking is necessary at this level because compaction only conflicts with a region
* split, and that cannot happen because the region server does them sequentially and not in
* parallel.
* @param compaction Compaction details, obtained by requestCompaction()
* @return whether the compaction completed
*/
public boolean compact(CompactionContext compaction, HStore store,
ThroughputController throughputController) throws IOException {
return compact(compaction, store, throughputController, null);
}
public boolean compact(CompactionContext compaction, HStore store,
ThroughputController throughputController, User user) throws IOException {
assert compaction != null && compaction.hasSelection();
assert !compaction.getRequest().getFiles().isEmpty();
if (this.closing.get() || this.closed.get()) {
LOG.debug("Skipping compaction on " + this + " because closing/closed");
store.cancelRequestedCompaction(compaction);
return false;
}
MonitoredTask status = null;
boolean requestNeedsCancellation = true;
/*
* We are trying to remove / relax the region read lock for compaction. Let's see what are the
* potential race conditions among the operations (user scan, region split, region close and
* region bulk load):
*
*   user scan        ---> region read lock
*   region split     ---> region close first ---> region write lock
*   region close     ---> region write lock
*   region bulk load ---> region write lock
*
* Read lock is compatible with read lock ---> no problem with user scan/read. Region bulk load
* does not cause a problem for compaction (no consistency problem, store lock will help the store
* file accounting). They can run almost concurrently at the region level. The only remaining race
* condition is between the region close and compaction. So we will evaluate, below, how region
* close intervenes with compaction if compaction does not acquire the region read lock.
*
* Here are the steps for compaction:
*   1. obtain list of StoreFile's
*   2. create StoreFileScanner's based on list from #1
*   3. perform compaction and save resulting files under tmp dir
*   4. swap in compacted files
*
* #1 is guarded by the store lock. This patch does not change this ---> no worse or better. For
* #2, we obtain the smallest read point (for the region) across all the Scanners (for both the
* default compactor and the stripe compactor). The read points are for user scans. The region
* keeps the read points for all currently open user scanners. Compaction needs to know the
* smallest read point so that during re-write of the hfiles, it can remove the mvcc points for
* the cells if their mvccs are older than the smallest, since they are not needed anymore. This
* will not conflict with compaction. #3 can be performed in parallel to other operations. For #4,
* bulk load and compaction don't conflict with each other on the region level (for multi-family
* atomicity).
*
* Region close and compaction are guarded pretty well by the 'writestate'. In HRegion#doClose(),
* we have:
*
*   synchronized (writestate) {
*     // Disable compacting and flushing by background threads for this region.
*     canFlush = !writestate.readOnly;
*     writestate.writesEnabled = false;
*     LOG.debug("Closing " + this + ": disabling compactions & flushes");
*     waitForFlushesAndCompactions();
*   }
*
* waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0, and in
* HRegion.compact():
*
*   try {
*     synchronized (writestate) {
*       if (writestate.writesEnabled) { wasStateSet = true; ++writestate.compacting; }
*       else { String msg = "NOT compacting region " + this + ". Writes disabled.";
*         LOG.info(msg); status.abort(msg); return false; }
*     }
*
* Also in compactor.performCompaction(), we check periodically to see if a system stop is
* requested:
*
*   if (closeChecker != null && closeChecker.isTimeLimit(store, now)) { progress.cancel(); return false; }
*   if (closeChecker != null && closeChecker.isSizeLimit(store, len)) { progress.cancel(); return false; }
*/
try {
byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
if (stores.get(cf) != store) {
LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
+ " has been re-instantiated, cancel this compaction request. "
+ " It may be caused by the roll back of split transaction");
return false;
}
status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
if (this.closed.get()) {
String msg = "Skipping compaction on " + this + " because closed";
LOG.debug(msg);
status.abort(msg);
return false;
}
boolean wasStateSet = false;
try {
synchronized (writestate) {
if (writestate.writesEnabled) {
wasStateSet = true;
writestate.compacting.incrementAndGet();
} else {
String msg = "NOT compacting region " + this + ". Writes disabled.";
LOG.info(msg);
status.abort(msg);
return false;
}
}
LOG.info("Starting compaction of {} in {}{}", store, this,
(compaction.getRequest().isOffPeak() ? " as an off-peak compaction" : ""));
doRegionCompactionPrep();
try {
status.setStatus("Compacting store " + store);
// We no longer need to cancel the request on the way out of this
// method because Store#compact will clean up unconditionally
requestNeedsCancellation = false;
store.compact(compaction, throughputController, user);
} catch (InterruptedIOException iioe) {
String msg = "region " + this + " compaction interrupted";
LOG.info(msg, iioe);
status.abort(msg);
return false;
}
} finally {
if (wasStateSet) {
synchronized (writestate) {
writestate.compacting.decrementAndGet();
if (writestate.compacting.get() <= 0) {
writestate.notifyAll();
}
}
}
}
status.markComplete("Compaction complete");
return true;
} finally {
if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
if (status != null) {
LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
status.cleanup();
}
}
}
/**
* Flush the cache.
*
* When this method is called the cache will be flushed unless:
*
* - the cache is empty
* - the region is closed.
* - a flush is already in progress
* - writes are disabled
*
*
* This method may block for some time, so it should not be called from a time-sensitive thread.
* @param flushAllStores whether we want to force a flush of all stores
* @return FlushResult indicating whether the flush was successful or not and if the region needs
* compacting
* @throws IOException general io exceptions because a snapshot was not properly persisted.
*/
// TODO HBASE-18905. We might have to expose a requestFlush API for CPs
public FlushResult flush(boolean flushAllStores) throws IOException {
return flushcache(flushAllStores, false, FlushLifeCycleTracker.DUMMY);
}
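// Illustrative usage sketch (an assumption, not part of the original source): a test or an
// admin-triggered flush path might flush all stores and check whether a follow-up compaction is
// advisable:
//
//   FlushResult result = region.flush(true);
//   if (result.isFlushSucceeded() && result.isCompactionNeeded()) {
//     // the flush produced enough store files that requesting a compaction may be worthwhile
//   }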
public interface FlushResult {
enum Result {
FLUSHED_NO_COMPACTION_NEEDED,
FLUSHED_COMPACTION_NEEDED,
// Special case where a flush didn't run because there's nothing in the memstores. Used when
// bulk loading to know when we can still load even if a flush didn't happen.
CANNOT_FLUSH_MEMSTORE_EMPTY,
CANNOT_FLUSH
}
/** Returns the detailed result code */
Result getResult();
/** Returns true if the memstores were flushed, else false */
boolean isFlushSucceeded();
/** Returns True if the flush requested a compaction, else false */
boolean isCompactionNeeded();
}
FlushResultImpl flushcache(boolean flushAllStores, boolean writeFlushRequestWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
List<byte[]> families = null;
if (flushAllStores) {
families = new ArrayList<>();
families.addAll(this.getTableDescriptor().getColumnFamilyNames());
}
return this.flushcache(families, writeFlushRequestWalMarker, tracker);
}
/**
* Flush the cache. When this method is called the cache will be flushed unless:
*
* - the cache is empty
* - the region is closed.
* - a flush is already in progress
* - writes are disabled
*
*
* This method may block for some time, so it should not be called from a time-sensitive thread.
* @param families stores of region to flush.
* @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
* @param tracker used to track the life cycle of this flush
* @return whether the flush is success and whether the region needs compacting
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was
* not properly persisted. The region is put in closing mode, and
* the caller MUST abort after this.
*/
public FlushResultImpl flushcache(List<byte[]> families, boolean writeFlushRequestWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
// fail-fast instead of waiting on the lock
if (this.closing.get()) {
String msg = "Skipping flush on " + this + " because closing";
LOG.debug(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
status.setStatus("Acquiring readlock on region");
// block waiting for the lock for flushing cache
lock.readLock().lock();
boolean flushed = true;
try {
if (this.closed.get()) {
String msg = "Skipping flush on " + this + " because closed";
LOG.debug(msg);
status.abort(msg);
flushed = false;
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-flush hooks");
coprocessorHost.preFlush(tracker);
}
// TODO: this should be managed within memstore with the snapshot, updated only after flush
// successful
if (numMutationsWithoutWAL.sum() > 0) {
numMutationsWithoutWAL.reset();
dataInMemoryWithoutWAL.reset();
}
synchronized (writestate) {
if (!writestate.flushing && writestate.writesEnabled) {
this.writestate.flushing = true;
} else {
String msg = "NOT flushing " + this + " as "
+ (writestate.flushing ? "already flushing" : "writes are not enabled");
LOG.debug(msg);
status.abort(msg);
flushed = false;
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
}
try {
// The reason that we do not always use flushPolicy is, when the flush is
// caused by logRoller, we should select stores which must be flushed
// rather than could be flushed.
Collection<HStore> specificStoresToFlush = null;
if (families != null) {
specificStoresToFlush = getSpecificStores(families);
} else {
specificStoresToFlush = flushPolicy.selectStoresToFlush();
}
FlushResultImpl fs =
internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker);
if (coprocessorHost != null) {
status.setStatus("Running post-flush coprocessor hooks");
coprocessorHost.postFlush(tracker);
}
if (fs.isFlushSucceeded()) {
flushesQueued.reset();
}
status.markComplete("Flush successful " + fs.toString());
return fs;
} finally {
synchronized (writestate) {
writestate.flushing = false;
this.writestate.flushRequested = false;
writestate.notifyAll();
}
}
} finally {
lock.readLock().unlock();
if (flushed) {
// Don't log this journal stuff if no flush -- confusing.
LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
status.prettyPrintJournal());
}
status.cleanup();
}
}
/**
* get stores which matches the specified families
* @return the stores need to be flushed.
*/
private Collection<HStore> getSpecificStores(List<byte[]> families) {
Collection<HStore> specificStoresToFlush = new ArrayList<>();
for (byte[] family : families) {
specificStoresToFlush.add(stores.get(family));
}
return specificStoresToFlush;
}
/**
* Should the store be flushed because it is old enough.
*
* Every FlushPolicy should call this to determine whether a store is old enough to flush (except
* that you always flush all stores). Otherwise the method will always return true, which will
* cause a lot of flush requests.
*/
boolean shouldFlushStore(HStore store) {
long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(),
store.getColumnFamilyDescriptor().getName()) - 1;
if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Flush column family " + store.getColumnFamilyName() + " of "
+ getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest
+ " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint());
}
return true;
}
if (this.flushCheckInterval <= 0) {
return false;
}
long now = EnvironmentEdgeManager.currentTime();
if (store.timeOfOldestEdit() < now - this.flushCheckInterval) {
if (LOG.isDebugEnabled()) {
LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of "
+ getRegionInfo().getEncodedName() + " because time of oldest edit="
+ store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now);
}
return true;
}
return false;
}
/**
* Should the memstore be flushed now
*/
boolean shouldFlush(final StringBuilder whyFlush) {
whyFlush.setLength(0);
// This is a rough measure.
if (
this.maxFlushedSeqId > 0
&& (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())
) {
whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush");
return true;
}
long modifiedFlushCheckInterval = flushCheckInterval;
if (
getRegionInfo().getTable().isSystemTable()
&& getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID
) {
modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL;
}
if (modifiedFlushCheckInterval <= 0) { // disabled
return false;
}
long now = EnvironmentEdgeManager.currentTime();
// if we flushed in the recent past, we don't need to do again now
if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
return false;
}
// since we didn't flush in the recent past, flush now if certain conditions
// are met. Return true on first such memstore hit.
for (HStore s : stores.values()) {
if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
// we have an old enough edit in the memstore, flush
whyFlush.append(s.toString() + " has an old edit so flush to free WALs");
return true;
}
}
return false;
}
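// Illustrative usage sketch (an assumption, not part of the original source): a periodic flush
// chore in the same package might consult this check roughly like this:
//
//   StringBuilder whyFlush = new StringBuilder();
//   if (region.shouldFlush(whyFlush)) {
//     LOG.info("Requesting flush: " + whyFlush);
//     region.flush(false); // let the flush policy pick which stores actually need flushing
//   }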
/**
* Flushing all stores.
* @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker)
*/
private FlushResult internalFlushcache(MonitoredTask status) throws IOException {
return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY);
}
/**
* Flushing given stores.
* @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker)
*/
private FlushResultImpl internalFlushcache(Collection<HStore> storesToFlush, MonitoredTask status,
boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException {
return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status,
writeFlushWalMarker, tracker);
}
/**
* Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the
* memstore, all of which have also been written to the wal. We need to write those updates in the
* memstore out to disk, while being able to process reads/writes as much as possible during the
* flush operation.
*
* This method may block for some time. Every time you call it, we up the region's sequence id even
* if we don't flush; i.e. the returned sequence id will be at least one larger than the last edit
* applied to this region. The returned id does not refer to an actual edit. The returned id can
* be used for say installing a bulk loaded file just ahead of the last hfile that was the result
* of this flush, etc.
* @param wal Null if we're NOT to go via wal.
* @param myseqid The seqid to use if wal is null when writing out the flush file.
* @param storesToFlush The list of stores to flush.
* @return object describing the flush's state
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of WAL is required.
*/
protected FlushResultImpl internalFlushcache(WAL wal, long myseqid,
Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
PrepareFlushResult result =
internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker);
if (result.result == null) {
return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
} else {
return result.result; // early exit due to failure from prepare stage
}
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DLS_DEAD_LOCAL_STORE",
justification = "FindBugs seems confused about trxId")
protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid,
Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker,
FlushLifeCycleTracker tracker) throws IOException {
if (this.rsServices != null && this.rsServices.isAborted()) {
// Don't flush when server aborting, it's unsafe
throw new IOException("Aborting flush because server is aborted...");
}
final long startTime = EnvironmentEdgeManager.currentTime();
// If nothing to flush, return, but return with a valid unused sequenceId.
// It's needed by bulk upload IIRC. It flushes until there are no edits in memory so it can insert
// a bulk loaded file between memory and existing hfiles. It wants a good sequenceId that belongs
// to no other edit, which it can use to associate with the bulk load. Hence this little dance below
// to go get one.
if (this.memStoreSizing.getDataSize() <= 0) {
// Take an update lock so no edits can come into memory just yet.
this.updatesLock.writeLock().lock();
WriteEntry writeEntry = null;
try {
if (this.memStoreSizing.getDataSize() <= 0) {
// Presume that if there are still no edits in the memstore, then there are no edits for
// this region out in the WAL subsystem so no need to do any trickery clearing out
// edits in the WAL sub-system. Up the sequence number so the resulting flush id is for
// sure just beyond the last appended region edit and not associated with any edit
// (useful as marker when bulk loading, etc.).
if (wal != null) {
writeEntry = mvcc.begin();
long flushOpSeqId = writeEntry.getWriteNumber();
FlushResultImpl flushResult =
new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId,
"Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
mvcc.completeAndWait(writeEntry);
// Set to null so we don't complete it again down in finally block.
writeEntry = null;
return new PrepareFlushResult(flushResult, myseqid);
} else {
return new PrepareFlushResult(new FlushResultImpl(
FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid);
}
}
} finally {
if (writeEntry != null) {
// If writeEntry is non-null, this operation failed; the mvcc transaction failed...
// but complete it anyways so it doesn't block the mvcc queue.
mvcc.complete(writeEntry);
}
this.updatesLock.writeLock().unlock();
}
}
logFatLineOnFlush(storesToFlush, myseqid);
// Stop updates while we snapshot the memstore of all of these regions' stores. We only have
// to do this for a moment. It is quick. We also set the memstore size to zero here before we
// allow updates again so its value will represent the size of the updates received
// during flush
// We have to take an update lock during snapshot, or else a write could end up in both snapshot
// and memstore (makes it difficult to do atomic rows then)
status.setStatus("Obtaining lock to block concurrent updates");
// block waiting for the lock for internal flush
this.updatesLock.writeLock().lock();
status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName());
MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing();
Map<byte[], Long> flushedFamilyNamesToSeq = new HashMap<>();
for (HStore store : storesToFlush) {
flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(),
store.preFlushSeqIDEstimation());
}
TreeMap<byte[], StoreFlushContext> storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR);
TreeMap<byte[], List<Path>> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
TreeMap<byte[], MemStoreSize> storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR);
// The sequence id of this flush operation which is used to log FlushMarker and pass to
// createFlushContext to use as the store file's sequence id. It can be in advance of edits
// still in the memstore, edits that are in other column families yet to be flushed.
long flushOpSeqId = HConstants.NO_SEQNUM;
// The max flushed sequence id after this flush operation completes. All edits in memstore
// will be in advance of this sequence id.
long flushedSeqId = HConstants.NO_SEQNUM;
byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
try {
if (wal != null) {
Long earliestUnflushedSequenceIdForTheRegion =
wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq);
if (earliestUnflushedSequenceIdForTheRegion == null) {
// This should never happen. This is how startCacheFlush signals flush cannot proceed.
String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
status.setStatus(msg);
return new PrepareFlushResult(
new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid);
}
flushOpSeqId = getNextSequenceId(wal);
// Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit
flushedSeqId = earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM
? flushOpSeqId
: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
} else {
// use the provided sequence Id as WAL is not being used for this flush.
flushedSeqId = flushOpSeqId = myseqid;
}
for (HStore s : storesToFlush) {
storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(),
s.createFlushContext(flushOpSeqId, tracker));
// for writing stores to WAL
committedFiles.put(s.getColumnFamilyDescriptor().getName(), null);
}
// write the snapshot start to WAL
if (wal != null && !writestate.readOnly) {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
// No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
mvcc);
}
// Prepare flush (take a snapshot)
storeFlushCtxs.forEach((name, flush) -> {
MemStoreSize snapshotSize = flush.prepare();
totalSizeOfFlushableStores.incMemStoreSize(snapshotSize);
storeFlushableSize.put(name, snapshotSize);
});
} catch (IOException ex) {
doAbortFlushToWAL(wal, flushOpSeqId, committedFiles);
throw ex;
} finally {
this.updatesLock.writeLock().unlock();
}
String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, "
+ "flushsize=" + totalSizeOfFlushableStores;
status.setStatus(s);
doSyncOfUnflushedWALChanges(wal, getRegionInfo());
return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores);
}
/**
* Utility method broken out of internalPrepareFlushCache so that method is smaller.
*/
private void logFatLineOnFlush(Collection<HStore> storesToFlush, long sequenceId) {
if (!LOG.isInfoEnabled()) {
return;
}
// Log a fat line detailing what is being flushed.
StringBuilder perCfExtras = null;
if (!isAllFamilies(storesToFlush)) {
perCfExtras = new StringBuilder();
for (HStore store : storesToFlush) {
MemStoreSize mss = store.getFlushableSize();
perCfExtras.append("; ").append(store.getColumnFamilyName());
perCfExtras.append("={dataSize=").append(StringUtils.byteDesc(mss.getDataSize()));
perCfExtras.append(", heapSize=").append(StringUtils.byteDesc(mss.getHeapSize()));
perCfExtras.append(", offHeapSize=").append(StringUtils.byteDesc(mss.getOffHeapSize()));
perCfExtras.append("}");
}
}
MemStoreSize mss = this.memStoreSizing.getMemStoreSize();
LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " + storesToFlush.size() + "/"
+ stores.size() + " column families," + " dataSize=" + StringUtils.byteDesc(mss.getDataSize())
+ " heapSize=" + StringUtils.byteDesc(mss.getHeapSize())
+ ((perCfExtras != null && perCfExtras.length() > 0) ? perCfExtras.toString() : "")
+ ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId));
}
private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId,
final Map<byte[], List<Path>> committedFiles) {
if (wal == null) return;
try {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc);
} catch (Throwable t) {
LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in "
+ " region {}", StringUtils.stringifyException(t), this);
// ignore this since we will be aborting the RS with DSE.
}
// we have called wal.startCacheFlush(), now we have to abort it
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
}
/**
* Sync unflushed WAL changes. See HBASE-8208 for details
*/
private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri)
throws IOException {
if (wal == null) {
return;
}
try {
wal.sync(); // ensure that flush marker is sync'ed
} catch (IOException ioe) {
wal.abortCacheFlush(hri.getEncodedNameAsBytes());
throw ioe;
}
}
/** Returns True if passed Set is all families in the region. */
private boolean isAllFamilies(Collection<HStore> families) {
return families == null || this.stores.size() == families.size();
}
/**
* Writes a marker to WAL indicating a flush is requested but cannot be completed due to various
* reasons. Ignores exceptions from WAL. Returns whether the write succeeded.
* @return whether WAL write was successful
*/
private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR));
try {
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
mvcc);
return true;
} catch (IOException e) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received exception while trying to write the flush request to wal", e);
}
}
return false;
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
justification = "Intentional; notify is about completed flush")
FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status,
PrepareFlushResult prepareResult, Collection<HStore> storesToFlush) throws IOException {
// prepare flush context is carried via PrepareFlushResult
TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
long startTime = prepareResult.startTime;
long flushOpSeqId = prepareResult.flushOpSeqId;
long flushedSeqId = prepareResult.flushedSeqId;
String s = "Flushing stores of " + this;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
// Any failure from here on out will be catastrophic requiring server
// restart so wal content can be replayed and put back into the memstore.
// Otherwise, the snapshot content, while backed up in the wal, will not
// be part of the currently running server's state.
boolean compactionRequested = false;
long flushedOutputFileSize = 0;
try {
// A. Flush memstore to all the HStores.
// Keep running vector of all store files that includes both old and the
// just-made new flush store file. The new flushed file is still in the
// tmp directory.
for (StoreFlushContext flush : storeFlushCtxs.values()) {
flush.flushCache(status);
}
// Switch snapshot (in memstore) -> new hfile (thus causing
// all the store scanners to reset/reseek).
for (Map.Entry<byte[], StoreFlushContext> flushEntry : storeFlushCtxs.entrySet()) {
StoreFlushContext sfc = flushEntry.getValue();
boolean needsCompaction = sfc.commit(status);
if (needsCompaction) {
compactionRequested = true;
}
byte[] storeName = flushEntry.getKey();
List<Path> storeCommittedFiles = sfc.getCommittedFiles();
committedFiles.put(storeName, storeCommittedFiles);
// Flush committed no files, indicating flush is empty or flush was canceled
if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName);
prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize);
}
flushedOutputFileSize += sfc.getOutputFileSize();
}
storeFlushCtxs.clear();
// Set down the memstore size by amount of flush.
MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize();
this.decrMemStoreSize(mss);
// Increase the size of this Region for the purposes of quota. Noop if quotas are disabled.
// During startup, quota manager may not be initialized yet.
if (rsServices != null) {
RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager();
if (quotaManager != null) {
quotaManager.getRegionSizeStore().incrementRegionSize(this.getRegionInfo(),
flushedOutputFileSize);
}
}
if (wal != null) {
// write flush marker to WAL. If fail, we should throw DroppedSnapshotException
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
mvcc);
}
} catch (Throwable t) {
// An exception here means that the snapshot was not persisted.
// The wal needs to be replayed so its content is restored to memstore.
// Currently, only a server restart will do this.
// We used to only catch IOEs but its possible that we'd get other
// exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
// all and sundry.
if (wal != null) {
try {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc);
} catch (Throwable ex) {
LOG.warn(
getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL",
ex);
// ignore this since we will be aborting the RS with DSE.
}
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
}
DroppedSnapshotException dse = new DroppedSnapshotException(
"region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()), t);
status.abort("Flush failed: " + StringUtils.stringifyException(t));
// Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
// However, since we may have the region read lock, we cannot call close(true) here since
// we cannot promote to a write lock. Instead we are setting closing so that all other region
// operations except for close will be rejected.
this.closing.set(true);
if (rsServices != null) {
// This is a safeguard against the case where the caller fails to explicitly handle aborting
rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
}
throw dse;
}
// If we get to here, the HStores have been written.
if (wal != null) {
wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId);
}
// Record latest flush time
for (HStore store : storesToFlush) {
this.lastStoreFlushTimeMap.put(store, startTime);
}
this.maxFlushedSeqId = flushedSeqId;
this.lastFlushOpSeqId = flushOpSeqId;
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
long time = EnvironmentEdgeManager.currentTime() - startTime;
MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize();
long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize();
String msg = "Finished flush of" + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/"
+ mss.getDataSize() + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/"
+ mss.getHeapSize() + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/"
+ memstoresize + " for " + this.getRegionInfo().getEncodedName() + " in " + time
+ "ms, sequenceid=" + flushOpSeqId + ", compaction requested=" + compactionRequested
+ ((wal == null) ? "; wal=null" : "");
LOG.info(msg);
status.setStatus(msg);
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(),
time, mss.getDataSize(), flushedOutputFileSize);
}
return new FlushResultImpl(compactionRequested
? FlushResult.Result.FLUSHED_COMPACTION_NEEDED
: FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
}
/**
* Method to safely get the next sequence number.
* @return Next sequence number unassociated with any actual edit.
*/
protected long getNextSequenceId(final WAL wal) throws IOException {
WriteEntry we = mvcc.begin();
mvcc.completeAndWait(we);
return we.getWriteNumber();
}
//////////////////////////////////////////////////////////////////////////////
// get() methods for client use.
//////////////////////////////////////////////////////////////////////////////
@Override
public RegionScannerImpl getScanner(Scan scan) throws IOException {
return getScanner(scan, null);
}
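// Illustrative usage sketch (an assumption, not part of the original source): server-side callers
// such as coprocessors typically obtain a scanner, drain it row by row, and always close it:
//
//   Scan scan = new Scan().addFamily(Bytes.toBytes("cf"));
//   try (RegionScanner scanner = region.getScanner(scan)) {
//     List<Cell> cells = new ArrayList<>();
//     boolean moreRows;
//     do {
//       cells.clear();
//       moreRows = scanner.next(cells); // false once the last row has been returned
//       // process the cells of the current row
//     } while (moreRows);
//   }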
@Override
public RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners)
throws IOException {
return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
private RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners,
long nonceGroup, long nonce) throws IOException {
return TraceUtil.trace(() -> {
startRegionOperation(Operation.SCAN);
try {
// Verify families are all valid
if (!scan.hasFamilies()) {
// Adding all families to scanner
for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
scan.addFamily(family);
}
} else {
for (byte[] family : scan.getFamilyMap().keySet()) {
checkFamily(family);
}
}
return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce);
} finally {
closeRegionOperation(Operation.SCAN);
}
}, () -> createRegionSpan("Region.getScanner"));
}
protected RegionScannerImpl instantiateRegionScanner(Scan scan,
List<KeyValueScanner> additionalScanners, long nonceGroup, long nonce) throws IOException {
if (scan.isReversed()) {
if (scan.getFilter() != null) {
scan.getFilter().setReversed(true);
}
return new ReversedRegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce);
}
return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce);
}
/**
* Prepare a delete for a row mutation processor
* @param delete The passed delete is modified by this method. WARNING!
*/
private void prepareDelete(Delete delete) throws IOException {
// Check to see if this is a deleteRow insert
if (delete.getFamilyCellMap().isEmpty()) {
for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
// Don't eat the timestamp
delete.addFamily(family, delete.getTimestamp());
}
} else {
for (byte[] family : delete.getFamilyCellMap().keySet()) {
if (family == null) {
throw new NoSuchColumnFamilyException("Empty family is invalid");
}
checkFamily(family);
}
}
}
@Override
public void delete(Delete delete) throws IOException {
TraceUtil.trace(() -> {
checkReadOnly();
checkResources();
startRegionOperation(Operation.DELETE);
try {
// All edits for the given row (across all column families) must happen atomically.
return mutate(delete);
} finally {
closeRegionOperation(Operation.DELETE);
}
}, () -> createRegionSpan("Region.delete"));
}
/**
* Set up correct timestamps in the KVs in Delete object.
*
* Caller should have the row and region locks.
*/
private void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
byte[] byteNow) throws IOException {
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<Cell> cells = e.getValue();
assert cells instanceof RandomAccess;
Map<byte[], Integer> kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR);
int listSize = cells.size();
for (int i = 0; i < listSize; i++) {
Cell cell = cells.get(i);
// Check if time is LATEST, change to time of most recent addition if so
// This is expensive.
if (
cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && PrivateCellUtil.isDeleteType(cell)
) {
byte[] qual = CellUtil.cloneQualifier(cell);
Integer count = kvCount.get(qual);
if (count == null) {
kvCount.put(qual, 1);
} else {
kvCount.put(qual, count + 1);
}
count = kvCount.get(qual);
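// Fetch up to 'count' versions of this column; the delete marker's timestamp is then aligned
// with the count-th most recent version (see updateDeleteLatestVersionTimestamp below).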
Get get = new Get(CellUtil.cloneRow(cell));
get.setMaxVersions(count);
get.addColumn(family, qual);
if (coprocessorHost != null) {
if (
!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell, byteNow, get)
) {
updateDeleteLatestVersionTimestamp(cell, get, count, byteNow);
}
} else {
updateDeleteLatestVersionTimestamp(cell, get, count, byteNow);
}
} else {
PrivateCellUtil.updateLatestStamp(cell, byteNow);
}
}
}
}
private void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow)
throws IOException {
try (RegionScanner scanner = getScanner(new Scan(get))) {
// NOTE: Please don't use HRegion.get() instead,
// because it will copy cells to heap. See HBASE-26036
List<Cell> result = new ArrayList<>();
scanner.next(result);
if (result.size() < count) {
// Nothing to delete
PrivateCellUtil.updateLatestStamp(cell, byteNow);
return;
}
if (result.size() > count) {
throw new RuntimeException("Unexpected size: " + result.size());
}
Cell getCell = result.get(count - 1);
PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp());
}
}
@Override
public void put(Put put) throws IOException {
TraceUtil.trace(() -> {
checkReadOnly();
// Do a rough check that we have resources to accept a write. The check is
// 'rough' in that between the resource check and the call to obtain a
// read lock, resources may run out. For now, the thought is that this
// will be extremely rare; we'll deal with it when it happens.
checkResources();
startRegionOperation(Operation.PUT);
try {
// All edits for the given row (across all column families) must happen atomically.
return mutate(put);
} finally {
closeRegionOperation(Operation.PUT);
}
}, () -> createRegionSpan("Region.put"));
}
/**
* Class that tracks the progress of a batch of operations, accumulating status codes and
* tracking the index at which processing is proceeding. These batch operations may get split
* into mini-batches for processing.
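* <p>
* Two concrete implementations below share this machinery: {@link MutationBatchOperation} for
* client mutations and {@link ReplayBatchOperation} for WAL replay.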
*/
private abstract static class BatchOperation<T> {
protected final T[] operations;
protected final OperationStatus[] retCodeDetails;
protected final WALEdit[] walEditsFromCoprocessors;
// reference family cell maps directly so coprocessors can mutate them if desired
protected final Map<byte[], List<Cell>>[] familyCellMaps;
// For Increment/Append operations
protected final Result[] results;
protected final HRegion region;
protected int nextIndexToProcess = 0;
protected final ObservedExceptionsInBatch observedExceptions;
// Durability of the batch (highest durability of all operations)
protected Durability durability;
protected boolean atomic = false;
public BatchOperation(final HRegion region, T[] operations) {
this.operations = operations;
this.retCodeDetails = new OperationStatus[operations.length];
Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
this.walEditsFromCoprocessors = new WALEdit[operations.length];
familyCellMaps = new Map[operations.length];
this.results = new Result[operations.length];
this.region = region;
observedExceptions = new ObservedExceptionsInBatch();
durability = Durability.USE_DEFAULT;
}
/**
* Visitor interface for batch operations
*/
@FunctionalInterface
interface Visitor {
/**
* @param index operation index
* @return If true continue visiting remaining entries, break otherwise
*/
boolean visit(int index) throws IOException;
}
/**
* Helper method for visiting pending or all batch operations
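* <p>
* A hedged, illustrative sketch of the lambda-driven usage pattern that the methods in this
* class rely on:
*
* <pre>{@code
* // visit only the still-pending operations up to the mini-batch boundary
* visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), index -> {
*   // ... per-operation work ...
*   return true; // keep visiting; return false to stop early
* });
* }</pre>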
*/
public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor)
throws IOException {
assert lastIndexExclusive <= this.size();
for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) {
if (!pendingOnly || isOperationPending(i)) {
if (!visitor.visit(i)) {
break;
}
}
}
}
public abstract Mutation getMutation(int index);
public abstract long getNonceGroup(int index);
public abstract long getNonce(int index);
/**
* This method is potentially expensive and useful mostly for non-replay CP path.
*/
public abstract Mutation[] getMutationsForCoprocs();
public abstract boolean isInReplay();
public abstract long getOrigLogSeqNum();
public abstract void startRegionOperation() throws IOException;
public abstract void closeRegionOperation() throws IOException;
/**
* Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs
* CP prePut()/preDelete()/preIncrement()/preAppend() hooks for all mutations in a batch. This
* is intended to operate on the entire batch and will be called from outside of the class to
* check and prepare the batch. This can be implemented by calling the helper method
* {@link #checkAndPrepareMutation(int, long)} in a 'for' loop over mutations.
*/
public abstract void checkAndPrepare() throws IOException;
/**
* Implement any Put request specific check and prepare logic here. Please refer to
* {@link #checkAndPrepareMutation(Mutation, long)} for how its used.
*/
protected abstract void checkAndPreparePut(final Put p) throws IOException;
/**
* If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell
* count, tags and timestamp for all cells of all operations in a mini-batch.
*/
public abstract void prepareMiniBatchOperations(
MiniBatchOperationInProgress<Mutation> miniBatchOp, long timestamp,
final List acquiredRowLocks) throws IOException;
/**
* Write mini-batch operations to MemStore
*/
public abstract WriteEntry writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException;
protected void writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final long writeNumber)
throws IOException {
MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing();
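// Sizes are accumulated in this local, non-thread-safe accounting object and then applied to
// the region-level memstore accounting in a single call after the visit below.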
visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
// We need to update the sequence id for the following reasons.
// 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id.
// 2) If no WAL, FSWALEntry won't be used
// we use durability of the original mutation for the mutation passed by CP.
if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) {
region.updateSequenceId(familyCellMaps[index].values(), writeNumber);
}
applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting);
return true;
});
// update memStore size
region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(),
memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount());
}
public boolean isDone() {
return nextIndexToProcess == operations.length;
}
public int size() {
return operations.length;
}
public boolean isOperationPending(int index) {
return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN;
}
public List<UUID> getClusterIds() {
assert size() != 0;
return getMutation(0).getClusterIds();
}
boolean isAtomic() {
return atomic;
}
/**
* Helper method that checks and prepares only one mutation. This can be used to implement
* {@link #checkAndPrepare()} for entire Batch. NOTE: As CP
* prePut()/preDelete()/preIncrement()/preAppend() hooks may modify mutations, this method
* should be called after prePut()/preDelete()/preIncrement()/preAppend() CP hooks are run for
* the mutation
*/
protected void checkAndPrepareMutation(Mutation mutation, final long timestamp)
throws IOException {
region.checkRow(mutation.getRow(), "batchMutate");
if (mutation instanceof Put) {
// Check the families in the put. If bad, skip this one.
checkAndPreparePut((Put) mutation);
region.checkTimestamps(mutation.getFamilyCellMap(), timestamp);
} else if (mutation instanceof Delete) {
region.prepareDelete((Delete) mutation);
} else if (mutation instanceof Increment || mutation instanceof Append) {
region.checkFamilies(mutation.getFamilyCellMap().keySet());
}
}
protected void checkAndPrepareMutation(int index, long timestamp) throws IOException {
Mutation mutation = getMutation(index);
try {
this.checkAndPrepareMutation(mutation, timestamp);
if (mutation instanceof Put || mutation instanceof Delete) {
// store the family map reference to allow for mutations
familyCellMaps[index] = mutation.getFamilyCellMap();
}
// store durability for the batch (highest durability of all operations in the batch)
Durability tmpDur = region.getEffectiveDurability(mutation.getDurability());
if (tmpDur.ordinal() > durability.ordinal()) {
durability = tmpDur;
}
} catch (NoSuchColumnFamilyException nscfe) {
final String msg = "No such column family in batch mutation in region " + this;
if (observedExceptions.hasSeenNoSuchFamily()) {
LOG.warn(msg + nscfe.getMessage());
} else {
LOG.warn(msg, nscfe);
observedExceptions.sawNoSuchFamily();
}
retCodeDetails[index] =
new OperationStatus(OperationStatusCode.BAD_FAMILY, nscfe.getMessage());
if (isAtomic()) { // fail, atomic means all or none
throw nscfe;
}
} catch (FailedSanityCheckException fsce) {
final String msg = "Batch Mutation did not pass sanity check in region " + this;
if (observedExceptions.hasSeenFailedSanityCheck()) {
LOG.warn(msg + fsce.getMessage());
} else {
LOG.warn(msg, fsce);
observedExceptions.sawFailedSanityCheck();
}
retCodeDetails[index] =
new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
if (isAtomic()) {
throw fsce;
}
} catch (WrongRegionException we) {
final String msg = "Batch mutation had a row that does not belong to this region " + this;
if (observedExceptions.hasSeenWrongRegion()) {
LOG.warn(msg + we.getMessage());
} else {
LOG.warn(msg, we);
observedExceptions.sawWrongRegion();
}
retCodeDetails[index] =
new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
if (isAtomic()) {
throw we;
}
}
}
/**
* Creates Mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which a row
* lock can be acquired. All mutations with locked rows are considered to be In-progress
* operations and hence the name {@link MiniBatchOperationInProgress}. Mini batch is window over
* {@link BatchOperation} and contains contiguous pending operations.
* @param acquiredRowLocks keeps track of rowLocks acquired.
*/
public MiniBatchOperationInProgress<Mutation>
lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException {
int readyToWriteCount = 0;
int lastIndexExclusive = 0;
RowLock prevRowLock = null;
for (; lastIndexExclusive < size(); lastIndexExclusive++) {
// Once readyToWriteCount reaches the miniBatchSize, stop here and process the miniBatch.
// This only applies to non-atomic batch operations.
if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) {
break;
}
if (!isOperationPending(lastIndexExclusive)) {
continue;
}
// HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting
// RS handlers, covering both MutationBatchOperation and ReplayBatchOperation
// The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't
// pass the isOperationPending check
Map<byte[], List<Cell>> curFamilyCellMap =
getMutation(lastIndexExclusive).getFamilyCellMap();
try {
// Start the protector before acquiring the row lock for performance reasons; it is
// finished below if an exception is encountered.
region.storeHotnessProtector.start(curFamilyCellMap);
} catch (RegionTooBusyException rtbe) {
region.storeHotnessProtector.finish(curFamilyCellMap);
if (isAtomic()) {
throw rtbe;
}
retCodeDetails[lastIndexExclusive] =
new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage());
continue;
}
Mutation mutation = getMutation(lastIndexExclusive);
// If we haven't got any rows in our batch, we should block to get the next one.
RowLock rowLock = null;
boolean throwException = false;
try {
// if atomic then get exclusive lock, else shared lock
rowLock = region.getRowLock(mutation.getRow(), !isAtomic(), prevRowLock);
} catch (TimeoutIOException | InterruptedIOException e) {
// NOTE: We will retry on other exceptions, but we should stop if we receive a
// TimeoutIOException or an InterruptedIOException, as the operation has timed out or been
// interrupted respectively.
throwException = true;
throw e;
} catch (IOException ioe) {
LOG.warn("Failed getting lock, row={}, in region {}",
Bytes.toStringBinary(mutation.getRow()), this, ioe);
if (isAtomic()) { // fail, atomic means all or none
throwException = true;
throw ioe;
}
} catch (Throwable throwable) {
throwException = true;
throw throwable;
} finally {
if (throwException) {
region.storeHotnessProtector.finish(curFamilyCellMap);
}
}
if (rowLock == null) {
// We failed to grab another lock
if (isAtomic()) {
region.storeHotnessProtector.finish(curFamilyCellMap);
throw new IOException("Can't apply all operations atomically!");
}
break; // Stop acquiring more rows for this batch
} else {
if (rowLock != prevRowLock) {
// It is a different row now, add this to the acquiredRowLocks and
// set prevRowLock to the new returned rowLock
acquiredRowLocks.add(rowLock);
prevRowLock = rowLock;
}
}
readyToWriteCount++;
}
return createMiniBatch(lastIndexExclusive, readyToWriteCount);
}
protected MiniBatchOperationInProgress<Mutation> createMiniBatch(final int lastIndexExclusive,
final int readyToWriteCount) {
return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails,
walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount);
}
/**
* Builds separate WALEdit per nonce by applying input mutations. If WALEdits from CP are
* present, they are merged to result WALEdit.
*/
public List<Pair<NonceKey, WALEdit>>
buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException {
List<Pair<NonceKey, WALEdit>> walEdits = new ArrayList<>();
visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() {
private Pair<NonceKey, WALEdit> curWALEditForNonce;
@Override
public boolean visit(int index) throws IOException {
Mutation m = getMutation(index);
// we use durability of the original mutation for the mutation passed by CP.
if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) {
region.recordMutationWithoutWal(m.getFamilyCellMap());
return true;
}
// the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each.
// Given how nonce keys are originally written, these should be contiguous.
// They don't have to be; it will still work, we will just write more WALEdits than needed.
long nonceGroup = getNonceGroup(index);
long nonce = getNonce(index);
if (
curWALEditForNonce == null
|| curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup
|| curWALEditForNonce.getFirst().getNonce() != nonce
) {
curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce),
new WALEdit(miniBatchOp.getCellCount(), isInReplay()));
walEdits.add(curWALEditForNonce);
}
WALEdit walEdit = curWALEditForNonce.getSecond();
// Add WAL edits from CPs.
WALEdit fromCP = walEditsFromCoprocessors[index];
if (fromCP != null) {
for (Cell cell : fromCP.getCells()) {
walEdit.add(cell);
}
}
walEdit.add(familyCellMaps[index]);
return true;
}
});
return walEdits;
}
/**
* This method completes mini-batch operations by calling postBatchMutate() CP hook (if
* required) and completing mvcc.
*/
public void completeMiniBatchOperations(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
if (writeEntry != null) {
region.mvcc.completeAndWait(writeEntry);
}
}
public void doPostOpCleanupForMiniBatch(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WALEdit walEdit,
boolean success) throws IOException {
doFinishHotnessProtector(miniBatchOp);
}
private void
doFinishHotnessProtector(final MiniBatchOperationInProgress<Mutation> miniBatchOp) {
// check and return if the protector is not enabled
if (!region.storeHotnessProtector.isEnable()) {
return;
}
// miniBatchOp is null if and only if lockRowsAndBuildMiniBatch threw an exception; that
// case has already been handled, so there is nothing more to clean up here.
if (miniBatchOp == null) {
return;
}
final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive();
for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) {
switch (retCodeDetails[i].getOperationStatusCode()) {
case SUCCESS:
case FAILURE:
region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap());
break;
default:
// do nothing
// We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the
// STORE_TOO_BUSY case is handled in StoreHotnessProtector#start
break;
}
}
}
/**
* Atomically apply the given map of family->edits to the memstore. This handles the consistency
* control on its own, but the caller should already have locked updatesLock.readLock(). This
* also does not check the families for validity.
* @param familyMap Map of Cells by family
*/
protected void applyFamilyMapToMemStore(Map<byte[], List<Cell>> familyMap,
MemStoreSizing memstoreAccounting) throws IOException {
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<Cell> cells = e.getValue();
assert cells instanceof RandomAccess;
region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting);
}
}
}
/**
* Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most of
* the logic is same.
*/
private static class MutationBatchOperation extends BatchOperation<Mutation> {
// For nonce operations
private long nonceGroup;
private long nonce;
protected boolean canProceed;
public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic,
long nonceGroup, long nonce) {
super(region, operations);
this.atomic = atomic;
this.nonceGroup = nonceGroup;
this.nonce = nonce;
}
@Override
public Mutation getMutation(int index) {
return this.operations[index];
}
@Override
public long getNonceGroup(int index) {
return nonceGroup;
}
@Override
public long getNonce(int index) {
return nonce;
}
@Override
public Mutation[] getMutationsForCoprocs() {
return this.operations;
}
@Override
public boolean isInReplay() {
return false;
}
@Override
public long getOrigLogSeqNum() {
return SequenceId.NO_SEQUENCE_ID;
}
@Override
public void startRegionOperation() throws IOException {
region.startRegionOperation(Operation.BATCH_MUTATE);
}
@Override
public void closeRegionOperation() throws IOException {
region.closeRegionOperation(Operation.BATCH_MUTATE);
}
@Override
public void checkAndPreparePut(Put p) throws IOException {
region.checkFamilies(p.getFamilyCellMap().keySet());
}
@Override
public void checkAndPrepare() throws IOException {
// index 0: puts, index 1: deletes, index 2: increments, index 3: append
final int[] metrics = { 0, 0, 0, 0 };
visitBatchOperations(true, this.size(), new Visitor() {
private long now = EnvironmentEdgeManager.currentTime();
private WALEdit walEdit;
@Override
public boolean visit(int index) throws IOException {
// Run coprocessor pre hook outside of locks to avoid deadlock
if (region.coprocessorHost != null) {
if (walEdit == null) {
walEdit = new WALEdit();
}
callPreMutateCPHook(index, walEdit, metrics);
if (!walEdit.isEmpty()) {
walEditsFromCoprocessors[index] = walEdit;
walEdit = null;
}
}
if (isOperationPending(index)) {
// TODO: Currently validation is done with current time before acquiring locks and
// updates are done with different timestamps after acquiring locks. This behavior is
// inherited from the code prior to this change. Can this be changed?
checkAndPrepareMutation(index, now);
}
return true;
}
});
// FIXME: we may update metrics twice! here for all operations bypassed by CP and later in
// normal processing.
// Update metrics in same way as it is done when we go the normal processing route (we now
// update general metrics though a Coprocessor did the work).
if (region.metricsRegion != null) {
if (metrics[0] > 0) {
// There were some Puts in the batch.
region.metricsRegion.updatePut();
}
if (metrics[1] > 0) {
// There were some Deletes in the batch.
region.metricsRegion.updateDelete();
}
if (metrics[2] > 0) {
// There were some Increment in the batch.
region.metricsRegion.updateIncrement();
}
if (metrics[3] > 0) {
// There were some Append in the batch.
region.metricsRegion.updateAppend();
}
}
}
@Override
public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
long timestamp, final List acquiredRowLocks) throws IOException {
// For nonce operations
canProceed = startNonceOperation();
visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
Mutation mutation = getMutation(index);
if (mutation instanceof Put) {
HRegion.updateCellTimestamps(familyCellMaps[index].values(), Bytes.toBytes(timestamp));
miniBatchOp.incrementNumOfPuts();
} else if (mutation instanceof Delete) {
region.prepareDeleteTimestamps(mutation, familyCellMaps[index], Bytes.toBytes(timestamp));
miniBatchOp.incrementNumOfDeletes();
} else if (mutation instanceof Increment || mutation instanceof Append) {
boolean returnResults;
if (mutation instanceof Increment) {
returnResults = ((Increment) mutation).isReturnResults();
} else {
returnResults = ((Append) mutation).isReturnResults();
}
// For nonce operations
if (!canProceed) {
Result result;
if (returnResults) {
// convert duplicate increment/append to get
List<Cell> results = region.get(toGet(mutation), false, nonceGroup, nonce);
result = Result.create(results);
} else {
result = Result.EMPTY_RESULT;
}
retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result);
return true;
}
Result result = null;
if (region.coprocessorHost != null) {
if (mutation instanceof Increment) {
result = region.coprocessorHost.preIncrementAfterRowLock((Increment) mutation);
} else {
result = region.coprocessorHost.preAppendAfterRowLock((Append) mutation);
}
}
if (result != null) {
retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS,
returnResults ? result : Result.EMPTY_RESULT);
return true;
}
List<Cell> results = returnResults ? new ArrayList<>(mutation.size()) : null;
familyCellMaps[index] = reckonDeltas(mutation, results, timestamp);
this.results[index] = results != null ? Result.create(results) : Result.EMPTY_RESULT;
if (mutation instanceof Increment) {
miniBatchOp.incrementNumOfIncrements();
} else {
miniBatchOp.incrementNumOfAppends();
}
}
region.rewriteCellTags(familyCellMaps[index], mutation);
// update cell count
if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
for (List<Cell> cells : mutation.getFamilyCellMap().values()) {
miniBatchOp.addCellCount(cells.size());
}
}
WALEdit fromCP = walEditsFromCoprocessors[index];
if (fromCP != null) {
miniBatchOp.addCellCount(fromCP.size());
}
return true;
});
if (region.coprocessorHost != null) {
// calling the pre CP hook for batch mutation
region.coprocessorHost.preBatchMutate(miniBatchOp);
checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp);
}
}
/**
* Starts the nonce operation for a mutation, if needed.
* @return whether to proceed this mutation.
*/
private boolean startNonceOperation() throws IOException {
if (
region.rsServices == null || region.rsServices.getNonceManager() == null
|| nonce == HConstants.NO_NONCE
) {
return true;
}
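// A nonce manager is present and this mutation carries a real nonce: register the
// (nonceGroup, nonce) pair. A false return means a duplicate operation already ran, and the
// caller answers it from a Get instead of re-applying the mutation (see the duplicate
// handling in prepareMiniBatchOperations above).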
boolean canProceed;
try {
canProceed =
region.rsServices.getNonceManager().startOperation(nonceGroup, nonce, region.rsServices);
} catch (InterruptedException ex) {
throw new InterruptedIOException("Nonce start operation interrupted");
}
return canProceed;
}
/**
* Ends nonce operation for a mutation, if needed.
* @param success Whether the operation for this nonce has succeeded.
*/
private void endNonceOperation(boolean success) {
if (
region.rsServices != null && region.rsServices.getNonceManager() != null
&& nonce != HConstants.NO_NONCE
) {
region.rsServices.getNonceManager().endOperation(nonceGroup, nonce, success);
}
}
private static Get toGet(final Mutation mutation) throws IOException {
assert mutation instanceof Increment || mutation instanceof Append;
Get get = new Get(mutation.getRow());
CellScanner cellScanner = mutation.cellScanner();
while (cellScanner.advance()) {
Cell cell = cellScanner.current();
get.addColumn(CellUtil.cloneFamily(cell), CellUtil.cloneQualifier(cell));
}
if (mutation instanceof Increment) {
// Increment
Increment increment = (Increment) mutation;
get.setTimeRange(increment.getTimeRange().getMin(), increment.getTimeRange().getMax());
} else {
// Append
Append append = (Append) mutation;
get.setTimeRange(append.getTimeRange().getMin(), append.getTimeRange().getMax());
}
for (Entry<String, byte[]> entry : mutation.getAttributesMap().entrySet()) {
get.setAttribute(entry.getKey(), entry.getValue());
}
return get;
}
private Map<byte[], List<Cell>> reckonDeltas(Mutation mutation, List<Cell> results, long now)
throws IOException {
assert mutation instanceof Increment || mutation instanceof Append;
Map<byte[], List<Cell>> ret = new TreeMap<>(Bytes.BYTES_COMPARATOR);
// Process a Store/family at a time.
for (Map.Entry<byte[], List<Cell>> entry : mutation.getFamilyCellMap().entrySet()) {
final byte[] columnFamilyName = entry.getKey();
List<Cell> deltas = entry.getValue();
// Reckon for the Store what to apply to WAL and MemStore.
List<Cell> toApply =
reckonDeltasByStore(region.stores.get(columnFamilyName), mutation, now, deltas, results);
if (!toApply.isEmpty()) {
for (Cell cell : toApply) {
HStore store = region.getStore(cell);
if (store == null) {
region.checkFamily(CellUtil.cloneFamily(cell));
} else {
ret.computeIfAbsent(store.getColumnFamilyDescriptor().getName(),
key -> new ArrayList<>()).add(cell);
}
}
}
}
return ret;
}
/**
* Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed column
* family/Store. Does Get of current value and then adds passed in deltas for this Store
* returning the result.
* @param mutation The encompassing Mutation object
* @param deltas Changes to apply to this Store; either increment amount or data to append
* @param results In here we accumulate all the Cells we are to return to the client. If null,
* client doesn't want results returned.
* @return Resulting Cells after deltas have been applied to current values. Side
* effect is our filling out of the results List.
*/
private List<Cell> reckonDeltasByStore(HStore store, Mutation mutation, long now,
List<Cell> deltas, List<Cell> results) throws IOException {
assert mutation instanceof Increment || mutation instanceof Append;
byte[] columnFamily = store.getColumnFamilyDescriptor().getName();
List<Pair<Cell, Cell>> cellPairs = new ArrayList<>(deltas.size());
// Sort the cells so that they match the order that they appear in the Get results.
// Otherwise, we won't be able to find the existing values if the cells are not specified
// in order by the client since cells are in an array list.
deltas.sort(store.getComparator());
// Get previous values for all columns in this family.
Get get = new Get(mutation.getRow());
for (Cell cell : deltas) {
get.addColumn(columnFamily, CellUtil.cloneQualifier(cell));
}
TimeRange tr;
if (mutation instanceof Increment) {
tr = ((Increment) mutation).getTimeRange();
} else {
tr = ((Append) mutation).getTimeRange();
}
if (tr != null) {
get.setTimeRange(tr.getMin(), tr.getMax());
}
try (RegionScanner scanner = region.getScanner(new Scan(get))) {
// NOTE: Please don't use HRegion.get() instead,
// because it will copy cells to heap. See HBASE-26036
List<Cell> currentValues = new ArrayList<>();
scanner.next(currentValues);
// Iterate the input columns and update existing values if they were found, otherwise
// add new column initialized to the delta amount
int currentValuesIndex = 0;
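// Walk deltas and currentValues in lockstep: both are ordered by the store comparator and the
// Get above only requested the delta columns, so each entry in currentValues lines up with one
// or more consecutive deltas on the same qualifier; the index only advances once the qualifier
// changes.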
for (int i = 0; i < deltas.size(); i++) {
Cell delta = deltas.get(i);
Cell currentValue = null;
if (
currentValuesIndex < currentValues.size()
&& CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)
) {
currentValue = currentValues.get(currentValuesIndex);
if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) {
currentValuesIndex++;
}
}
// Switch on whether this is an increment or an append, building the new Cell to apply.
Cell newCell;
if (mutation instanceof Increment) {
long deltaAmount = getLongValue(delta);
final long newValue =
currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount;
newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation,
(oldCell) -> Bytes.toBytes(newValue));
} else {
newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation,
(oldCell) -> ByteBuffer
.wrap(new byte[delta.getValueLength() + oldCell.getValueLength()])
.put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength())
.put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength())
.array());
}
if (region.maxCellSize > 0) {
int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell);
if (newCellSize > region.maxCellSize) {
String msg = "Cell with size " + newCellSize + " exceeds limit of "
+ region.maxCellSize + " bytes in region " + this;
LOG.debug(msg);
throw new DoNotRetryIOException(msg);
}
}
cellPairs.add(new Pair<>(currentValue, newCell));
// Add to results to get returned to the Client. If null, client does not want results.
if (results != null) {
results.add(newCell);
}
}
// Give coprocessors a chance to update the new cells before apply to WAL or memstore
if (region.coprocessorHost != null) {
// Here the operation must be increment or append.
cellPairs = mutation instanceof Increment
? region.coprocessorHost.postIncrementBeforeWAL(mutation, cellPairs)
: region.coprocessorHost.postAppendBeforeWAL(mutation, cellPairs);
}
}
return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList());
}
private static Cell reckonDelta(final Cell delta, final Cell currentCell,
final byte[] columnFamily, final long now, Mutation mutation, Function<Cell, byte[]> supplier)
throws IOException {
// Forward any tags found on the delta.
List<Tag> tags = TagUtil.carryForwardTags(delta);
if (currentCell != null) {
tags = TagUtil.carryForwardTags(tags, currentCell);
tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
byte[] newValue = supplier.apply(currentCell);
return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(mutation.getRow(), 0, mutation.getRow().length)
.setFamily(columnFamily, 0, columnFamily.length)
// copy the qualifier if the cell is located in shared memory.
.setQualifier(CellUtil.cloneQualifier(delta))
.setTimestamp(Math.max(currentCell.getTimestamp() + 1, now))
.setType(KeyValue.Type.Put.getCode()).setValue(newValue, 0, newValue.length)
.setTags(TagUtil.fromList(tags)).build();
} else {
tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
PrivateCellUtil.updateLatestStamp(delta, now);
return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags);
}
}
/** Returns the long value held in the passed-in Cell. */
private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
int len = cell.getValueLength();
if (len != Bytes.SIZEOF_LONG) {
// throw DoNotRetryIOException instead of IllegalArgumentException
throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
}
return PrivateCellUtil.getValueAsLong(cell);
}
@Override
public List<Pair<NonceKey, WALEdit>>
buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException {
List<Pair<NonceKey, WALEdit>> walEdits = super.buildWALEdits(miniBatchOp);
// for MutationBatchOperation, more than one nonce is not allowed
if (walEdits.size() > 1) {
throw new IOException("Found multiple nonce keys per batch!");
}
return walEdits;
}
@Override
public WriteEntry writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, @Nullable WriteEntry writeEntry)
throws IOException {
if (writeEntry == null) {
writeEntry = region.mvcc.begin();
}
super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber());
return writeEntry;
}
@Override
public void completeMiniBatchOperations(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
// TODO: can it be done after completing mvcc?
// calling the post CP hook for batch mutation
if (region.coprocessorHost != null) {
region.coprocessorHost.postBatchMutate(miniBatchOp);
}
super.completeMiniBatchOperations(miniBatchOp, writeEntry);
if (nonce != HConstants.NO_NONCE) {
if (region.rsServices != null && region.rsServices.getNonceManager() != null) {
region.rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce,
writeEntry.getWriteNumber());
}
}
}
@Override
public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress<Mutation> miniBatchOp,
final WALEdit walEdit, boolean success) throws IOException {
super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success);
if (miniBatchOp != null) {
// synced so that the coprocessor contract is adhered to.
if (region.coprocessorHost != null) {
visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> {
// only for successful puts/deletes/increments/appends
if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) {
Mutation m = getMutation(i);
if (m instanceof Put) {
region.coprocessorHost.postPut((Put) m, walEdit);
} else if (m instanceof Delete) {
region.coprocessorHost.postDelete((Delete) m, walEdit);
} else if (m instanceof Increment) {
Result result =
region.getCoprocessorHost().postIncrement((Increment) m, results[i], walEdit);
if (result != results[i]) {
retCodeDetails[i] =
new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result);
}
} else if (m instanceof Append) {
Result result =
region.getCoprocessorHost().postAppend((Append) m, results[i], walEdit);
if (result != results[i]) {
retCodeDetails[i] =
new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result);
}
}
}
return true;
});
}
// For nonce operations
if (canProceed && nonce != HConstants.NO_NONCE) {
boolean[] areAllIncrementsAndAppendsSuccessful = new boolean[] { true };
visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> {
Mutation mutation = getMutation(i);
if (mutation instanceof Increment || mutation instanceof Append) {
if (retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.SUCCESS) {
areAllIncrementsAndAppendsSuccessful[0] = false;
return false;
}
}
return true;
});
endNonceOperation(areAllIncrementsAndAppendsSuccessful[0]);
}
// See if the column families were consistent through the whole thing.
// if they were then keep them. If they were not then pass a null.
// null will be treated as unknown.
// Total time taken might be involving Puts, Deletes, Increments and Appends.
// Split the time for puts and deletes based on the total number of Puts, Deletes,
// Increments and Appends.
if (region.metricsRegion != null) {
if (miniBatchOp.getNumOfPuts() > 0) {
// There were some Puts in the batch.
region.metricsRegion.updatePut();
}
if (miniBatchOp.getNumOfDeletes() > 0) {
// There were some Deletes in the batch.
region.metricsRegion.updateDelete();
}
if (miniBatchOp.getNumOfIncrements() > 0) {
// There were some Increments in the batch.
region.metricsRegion.updateIncrement();
}
if (miniBatchOp.getNumOfAppends() > 0) {
// There were some Appends in the batch.
region.metricsRegion.updateAppend();
}
}
}
if (region.coprocessorHost != null) {
// call the coprocessor hook to do any finalization steps after the put is done
region.coprocessorHost.postBatchMutateIndispensably(
miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success);
}
}
/**
* Runs prePut/preDelete/preIncrement/preAppend coprocessor hook for input mutation in a batch
* @param metrics Array of 4 ints. index 0: count of puts, index 1: count of deletes, index 2:
*                count of increments and index 3: count of appends
*/
private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics)
throws IOException {
Mutation m = getMutation(index);
if (m instanceof Put) {
if (region.coprocessorHost.prePut((Put) m, walEdit)) {
// pre hook says skip this Put
// mark as success and skip in doMiniBatchMutation
metrics[0]++;
retCodeDetails[index] = OperationStatus.SUCCESS;
}
} else if (m instanceof Delete) {
Delete curDel = (Delete) m;
if (curDel.getFamilyCellMap().isEmpty()) {
// handle deleting a row case
// TODO: prepareDelete() has been called twice, before and after preDelete() CP hook.
// Can this be avoided?
region.prepareDelete(curDel);
}
if (region.coprocessorHost.preDelete(curDel, walEdit)) {
// pre hook says skip this Delete
// mark as success and skip in doMiniBatchMutation
metrics[1]++;
retCodeDetails[index] = OperationStatus.SUCCESS;
}
} else if (m instanceof Increment) {
Increment increment = (Increment) m;
Result result = region.coprocessorHost.preIncrement(increment, walEdit);
if (result != null) {
// pre hook says skip this Increment
// mark as success and skip in doMiniBatchMutation
metrics[2]++;
retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result);
}
} else if (m instanceof Append) {
Append append = (Append) m;
Result result = region.coprocessorHost.preAppend(append, walEdit);
if (result != null) {
// pre hook says skip this Append
// mark as success and skip in doMiniBatchMutation
metrics[3]++;
retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result);
}
} else {
String msg = "Put/Delete/Increment/Append mutations only supported in a batch";
retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg);
if (isAtomic()) { // fail, atomic means all or none
throw new IOException(msg);
}
}
}
// TODO Support Increment/Append operations
private void checkAndMergeCPMutations(final MiniBatchOperationInProgress<Mutation> miniBatchOp,
final List<RowLock> acquiredRowLocks, final long timestamp) throws IOException {
visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> {
// we pass (i - nextIndexToProcess) below since the call expects an index relative to the mini-batch
Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess);
if (cpMutations == null) {
return true;
}
// Else Coprocessor added more Mutations corresponding to the Mutation at this index.
Mutation mutation = getMutation(i);
for (Mutation cpMutation : cpMutations) {
this.checkAndPrepareMutation(cpMutation, timestamp);
// Acquire row locks. If not, the whole batch will fail.
acquiredRowLocks.add(region.getRowLock(cpMutation.getRow(), true, null));
// Returned mutations from coprocessor correspond to the Mutation at index i. We can
// directly add the cells from those mutations to the familyMaps of this mutation.
Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap();
region.rewriteCellTags(cpFamilyMap, mutation);
// will get added to the memStore later
mergeFamilyMaps(familyCellMaps[i], cpFamilyMap);
// The durability of returned mutation is replaced by the corresponding mutation.
// If the corresponding mutation contains the SKIP_WAL, we shouldn't count the
// cells of returned mutation.
if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
for (List<Cell> cells : cpFamilyMap.values()) {
miniBatchOp.addCellCount(cells.size());
}
}
}
return true;
});
}
private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap,
Map<byte[], List<Cell>> toBeMerged) {
for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) {
List<Cell> cells = familyMap.get(entry.getKey());
if (cells == null) {
familyMap.put(entry.getKey(), entry.getValue());
} else {
cells.addAll(entry.getValue());
}
}
}
}
/**
* Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most
* of the logic is same.
*/
private static final class ReplayBatchOperation extends BatchOperation<MutationReplay> {
private long origLogSeqNum = 0;
public ReplayBatchOperation(final HRegion region, MutationReplay[] operations,
long origLogSeqNum) {
super(region, operations);
this.origLogSeqNum = origLogSeqNum;
}
@Override
public Mutation getMutation(int index) {
return this.operations[index].mutation;
}
@Override
public long getNonceGroup(int index) {
return this.operations[index].nonceGroup;
}
@Override
public long getNonce(int index) {
return this.operations[index].nonce;
}
@Override
public Mutation[] getMutationsForCoprocs() {
return null;
}
@Override
public boolean isInReplay() {
return true;
}
@Override
public long getOrigLogSeqNum() {
return this.origLogSeqNum;
}
@Override
public void startRegionOperation() throws IOException {
region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE);
}
@Override
public void closeRegionOperation() throws IOException {
region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE);
}
/**
* During replay, column families may have been removed between the region server failure and
* the replay.
*/
@Override
protected void checkAndPreparePut(Put p) throws IOException {
Map<byte[], List<Cell>> familyCellMap = p.getFamilyCellMap();
List<byte[]> nonExistentList = null;
for (byte[] family : familyCellMap.keySet()) {
if (!region.htableDescriptor.hasColumnFamily(family)) {
if (nonExistentList == null) {
nonExistentList = new ArrayList<>();
}
nonExistentList.add(family);
}
}
if (nonExistentList != null) {
for (byte[] family : nonExistentList) {
// Perhaps schema was changed between crash and replay
LOG.info("No family for {}, omitting from replay in region {}.", Bytes.toString(family), this);
familyCellMap.remove(family);
}
}
}
@Override
public void checkAndPrepare() throws IOException {
long now = EnvironmentEdgeManager.currentTime();
visitBatchOperations(true, this.size(), (int index) -> {
checkAndPrepareMutation(index, now);
return true;
});
}
@Override
public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
long timestamp, final List acquiredRowLocks) throws IOException {
visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
// update cell count
for (List<Cell> cells : getMutation(index).getFamilyCellMap().values()) {
miniBatchOp.addCellCount(cells.size());
}
return true;
});
}
@Override
public WriteEntry writeMiniBatchOperationsToMemStore(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum());
return writeEntry;
}
@Override
public void completeMiniBatchOperations(
final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
throws IOException {
super.completeMiniBatchOperations(miniBatchOp, writeEntry);
region.mvcc.advanceTo(getOrigLogSeqNum());
}
}
public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup,
long nonce) throws IOException {
// As it stands, this is used for a couple of things:
// * batchMutate with single mutation - put/delete/increment/append, separate or from
// checkAndMutate.
// * coprocessor calls (see ex. BulkDeleteEndpoint).
// So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce));
}
@Override
public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
// If the mutations has any Increment/Append operations, we need to do batchMutate atomically
boolean atomic =
Arrays.stream(mutations).anyMatch(m -> m instanceof Increment || m instanceof Append);
return batchMutate(mutations, atomic);
}
OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic) throws IOException {
return TraceUtil.trace(
() -> batchMutate(mutations, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE),
() -> createRegionSpan("Region.batchMutate"));
}
public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
throws IOException {
if (
!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
&& replaySeqId < lastReplayedOpenRegionSeqId
) {
// if it is a secondary replica we should ignore these entries silently
// since they are coming out of order
if (LOG.isTraceEnabled()) {
LOG.trace(getRegionInfo().getEncodedName() + " : " + "Skipping " + mutations.length
+ " mutations with replaySeqId=" + replaySeqId
+ " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
for (MutationReplay mut : mutations) {
LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
}
}
OperationStatus[] statuses = new OperationStatus[mutations.length];
for (int i = 0; i < statuses.length; i++) {
statuses[i] = OperationStatus.SUCCESS;
}
return statuses;
}
return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId));
}
/**
* Perform a batch of mutations.
*
* Operations in a batch are stored with the highest durability specified among all operations
* in the batch, except for {@link Durability#SKIP_WAL}.
*
* This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with
* {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[])} with
* {@link MutationBatchOperation} instance as an argument. As the processing of replay batch and
* mutation batch is very similar, a lot of code is shared by providing generic methods in the base
* class {@link BatchOperation}. The logic for this method and
* {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which are
* overridden by derived classes to implement special behavior.
* @param batchOp contains the list of mutations
* @return an array of OperationStatus which internally contains the OperationStatusCode and the
* exceptionMessage if any.
* @throws IOException if an IO problem is encountered
*/
private OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException {
boolean initialized = false;
batchOp.startRegionOperation();
try {
while (!batchOp.isDone()) {
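// Each pass through this loop handles one mini-batch; doMiniBatchMutate() advances
// batchOp.nextIndexToProcess, so isDone() eventually becomes true and the loop exits.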
if (!batchOp.isInReplay()) {
checkReadOnly();
}
checkResources();
if (!initialized) {
this.writeRequestsCount.add(batchOp.size());
// validate and prepare batch for write, for MutationBatchOperation it also calls CP
// prePut()/preDelete()/preIncrement()/preAppend() hooks
batchOp.checkAndPrepare();
initialized = true;
}
doMiniBatchMutate(batchOp);
requestFlushIfNeeded();
}
} finally {
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.getTableName(),
batchOp.size());
}
batchOp.closeRegionOperation();
}
return batchOp.retCodeDetails;
}
/**
* Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[])} In here we
* also handle replay of edits on region recover. Also gets change in size brought about by
* applying {@code batchOp}.
*/
private void doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException {
boolean success = false;
WALEdit walEdit = null;
WriteEntry writeEntry = null;
boolean locked = false;
// We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive)
MiniBatchOperationInProgress<Mutation> miniBatchOp = null;
/** Keep track of the locks we hold so we can release them in finally clause */
List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size());
// Check for thread interrupt status in case we have been signaled from
// #interruptRegionOperation.
checkInterrupt();
try {
// STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with
// locked rows
miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks);
// We've now grabbed as many mutations off the list as we can
// Ensure we acquire at least one.
if (miniBatchOp.getReadyToWriteCount() <= 0) {
// Nothing to put/delete/increment/append -- an exception in the above such as
// NoSuchColumnFamily?
return;
}
// Check for thread interrupt status in case we have been signaled from
// #interruptRegionOperation. Do it before we take the lock and disable interrupts for
// the WAL append.
checkInterrupt();
lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount());
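// Mutations hold the updatesLock read lock while writing; a flush takes the write lock, so
// holding the read lock here keeps a concurrent flush from snapshotting the memstore in the
// middle of this mini-batch.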
locked = true;
// From this point until memstore update this operation should not be interrupted.
disableInterrupts();
// STEP 2. Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp
// We should record the timestamp only after we have acquired the rowLock,
// otherwise, newer puts/deletes/increment/append are not guaranteed to have a newer
// timestamp
long now = EnvironmentEdgeManager.currentTime();
batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks);
// STEP 3. Build WAL edit
List<Pair<NonceKey, WALEdit>> walEdits = batchOp.buildWALEdits(miniBatchOp);
// STEP 4. Append the WALEdits to WAL and sync.
for (Iterator<Pair<NonceKey, WALEdit>> it = walEdits.iterator(); it.hasNext();) {
Pair<NonceKey, WALEdit> nonceKeyWALEditPair = it.next();
walEdit = nonceKeyWALEditPair.getSecond();
NonceKey nonceKey = nonceKeyWALEditPair.getFirst();
if (walEdit != null && !walEdit.isEmpty()) {
writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now,
nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum());
}
// Complete mvcc for all but last writeEntry (for replay case)
if (it.hasNext() && writeEntry != null) {
mvcc.complete(writeEntry);
writeEntry = null;
}
}
// STEP 5. Write back to memStore
// NOTE: writeEntry can be null here
writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry);
// STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and
// complete mvcc for last writeEntry
batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry);
writeEntry = null;
success = true;
} finally {
// Call complete rather than completeAndWait because we probably had an error if writeEntry is still non-null here
if (writeEntry != null) mvcc.complete(writeEntry);
if (locked) {
this.updatesLock.readLock().unlock();
}
releaseRowLocks(acquiredRowLocks);
enableInterrupts();
final int finalLastIndexExclusive =
miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size();
final boolean finalSuccess = success;
batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> {
Mutation mutation = batchOp.getMutation(i);
if (mutation instanceof Increment || mutation instanceof Append) {
if (finalSuccess) {
batchOp.retCodeDetails[i] =
new OperationStatus(OperationStatusCode.SUCCESS, batchOp.results[i]);
} else {
batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
}
} else {
batchOp.retCodeDetails[i] =
finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE;
}
return true;
});
batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess);
batchOp.nextIndexToProcess = finalLastIndexExclusive;
}
}
/**
* Returns effective durability from the passed durability and the table descriptor.
*/
private Durability getEffectiveDurability(Durability d) {
return d == Durability.USE_DEFAULT ? this.regionDurability : d;
}
@Override
@Deprecated
public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op,
ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException {
CheckAndMutate checkAndMutate;
try {
CheckAndMutate.Builder builder = CheckAndMutate.newBuilder(row)
.ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange);
if (mutation instanceof Put) {
checkAndMutate = builder.build((Put) mutation);
} else if (mutation instanceof Delete) {
checkAndMutate = builder.build((Delete) mutation);
} else {
throw new DoNotRetryIOException(
"Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase());
}
} catch (IllegalArgumentException e) {
throw new DoNotRetryIOException(e.getMessage());
}
return checkAndMutate(checkAndMutate).isSuccess();
}
@Override
@Deprecated
public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation)
throws IOException {
CheckAndMutate checkAndMutate;
try {
CheckAndMutate.Builder builder =
CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange);
if (mutation instanceof Put) {
checkAndMutate = builder.build((Put) mutation);
} else if (mutation instanceof Delete) {
checkAndMutate = builder.build((Delete) mutation);
} else {
throw new DoNotRetryIOException(
"Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase());
}
} catch (IllegalArgumentException e) {
throw new DoNotRetryIOException(e.getMessage());
}
return checkAndMutate(checkAndMutate).isSuccess();
}
@Override
@Deprecated
public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op,
ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException {
CheckAndMutate checkAndMutate;
try {
checkAndMutate = CheckAndMutate.newBuilder(row)
.ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange).build(rm);
} catch (IllegalArgumentException e) {
throw new DoNotRetryIOException(e.getMessage());
}
return checkAndMutate(checkAndMutate).isSuccess();
}
@Override
@Deprecated
public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm)
throws IOException {
CheckAndMutate checkAndMutate;
try {
checkAndMutate =
CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange).build(rm);
} catch (IllegalArgumentException e) {
throw new DoNotRetryIOException(e.getMessage());
}
return checkAndMutate(checkAndMutate).isSuccess();
}
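// Illustrative (hedged) caller-side sketch of the CheckAndMutate API exercised below; 'row',
// 'family', 'qualifier', 'value' and 'put' are placeholders, not members of this class:
//
//   CheckAndMutate cam = CheckAndMutate.newBuilder(row)
//     .ifMatches(family, qualifier, CompareOperator.EQUAL, value)
//     .build(put);
//   CheckAndMutateResult result = region.checkAndMutate(cam);
//   boolean success = result.isSuccess();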
@Override
public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate) throws IOException {
return checkAndMutate(checkAndMutate, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate, long nonceGroup,
long nonce) throws IOException {
return TraceUtil.trace(() -> checkAndMutateInternal(checkAndMutate, nonceGroup, nonce),
() -> createRegionSpan("Region.checkAndMutate"));
}
private CheckAndMutateResult checkAndMutateInternal(CheckAndMutate checkAndMutate,
long nonceGroup, long nonce) throws IOException {
byte[] row = checkAndMutate.getRow();
Filter filter = null;
byte[] family = null;
byte[] qualifier = null;
CompareOperator op = null;
ByteArrayComparable comparator = null;
if (checkAndMutate.hasFilter()) {
filter = checkAndMutate.getFilter();
} else {
family = checkAndMutate.getFamily();
qualifier = checkAndMutate.getQualifier();
op = checkAndMutate.getCompareOp();
comparator = new BinaryComparator(checkAndMutate.getValue());
}
TimeRange timeRange = checkAndMutate.getTimeRange();
Mutation mutation = null;
RowMutations rowMutations = null;
if (checkAndMutate.getAction() instanceof Mutation) {
mutation = (Mutation) checkAndMutate.getAction();
} else {
rowMutations = (RowMutations) checkAndMutate.getAction();
}
if (mutation != null) {
checkMutationType(mutation);
checkRow(mutation, row);
} else {
checkRow(rowMutations, row);
}
checkReadOnly();
// TODO, add check for value length also move this check to the client
checkResources();
startRegionOperation();
try {
Get get = new Get(row);
if (family != null) {
checkFamily(family);
get.addColumn(family, qualifier);
}
if (filter != null) {
get.setFilter(filter);
}
if (timeRange != null) {
get.setTimeRange(timeRange.getMin(), timeRange.getMax());
}
// Lock row - note that doBatchMutate will relock this row if called
checkRow(row, "doCheckAndRowMutate");
RowLock rowLock = getRowLock(get.getRow(), false, null);
try {
if (this.getCoprocessorHost() != null) {
CheckAndMutateResult result =
getCoprocessorHost().preCheckAndMutateAfterRowLock(checkAndMutate);
if (result != null) {
return result;
}
}
// NOTE: We used to wait here until mvcc caught up: mvcc.await();
// Supposition is that now all changes are done under row locks, then when we go to read,
// we'll get the latest on this row.
boolean matches = false;
long cellTs = 0;
try (RegionScanner scanner = getScanner(new Scan(get))) {
// NOTE: Please don't use HRegion.get() instead,
// because it will copy cells to heap. See HBASE-26036
List<Cell> result = new ArrayList<>(1);
scanner.next(result);
if (filter != null) {
if (!result.isEmpty()) {
matches = true;
cellTs = result.get(0).getTimestamp();
}
} else {
boolean valueIsNull =
comparator.getValue() == null || comparator.getValue().length == 0;
if (result.isEmpty() && valueIsNull) {
matches = op != CompareOperator.NOT_EQUAL;
} else if (result.size() > 0 && valueIsNull) {
matches = (result.get(0).getValueLength() == 0) == (op != CompareOperator.NOT_EQUAL);
cellTs = result.get(0).getTimestamp();
} else if (result.size() == 1) {
Cell kv = result.get(0);
cellTs = kv.getTimestamp();
int compareResult = PrivateCellUtil.compareValue(kv, comparator);
matches = matches(op, compareResult);
}
}
}
// If matches, perform the mutation or the rowMutations
if (matches) {
// We have acquired the row lock already. If the system clock is NOT monotonically
// non-decreasing (see HBASE-14070) we should make sure that the mutation has a
// larger timestamp than what was observed via Get. doBatchMutate already does this, but
// there is no way to pass the cellTs. See HBASE-14054.
long now = EnvironmentEdgeManager.currentTime();
long ts = Math.max(now, cellTs); // ensure write is not eclipsed
byte[] byteTs = Bytes.toBytes(ts);
if (mutation != null) {
if (mutation instanceof Put) {
updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs);
}
// For a Delete this is not needed, since the delete already does a second get and sets the
// timestamp from that get (see prepareDeleteTimestamps).
} else {
for (Mutation m : rowMutations.getMutations()) {
if (m instanceof Put) {
updateCellTimestamps(m.getFamilyCellMap().values(), byteTs);
}
}
// For a Delete this is not needed, since the delete already does a second get and sets the
// timestamp from that get (see prepareDeleteTimestamps).
}
// All edits for the given row (across all column families) must happen atomically.
Result r;
if (mutation != null) {
r = mutate(mutation, true, nonceGroup, nonce).getResult();
} else {
r = mutateRow(rowMutations, nonceGroup, nonce);
}
this.checkAndMutateChecksPassed.increment();
return new CheckAndMutateResult(true, r);
}
this.checkAndMutateChecksFailed.increment();
return new CheckAndMutateResult(false, null);
} finally {
rowLock.release();
}
} finally {
closeRegionOperation();
}
}
private void checkMutationType(final Mutation mutation) throws DoNotRetryIOException {
if (
!(mutation instanceof Put) && !(mutation instanceof Delete)
&& !(mutation instanceof Increment) && !(mutation instanceof Append)
) {
throw new org.apache.hadoop.hbase.DoNotRetryIOException(
"Action must be Put or Delete or Increment or Delete");
}
}
private void checkRow(final Row action, final byte[] row) throws DoNotRetryIOException {
if (!Bytes.equals(row, action.getRow())) {
throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match");
}
}
private boolean matches(final CompareOperator op, final int compareResult) {
boolean matches = false;
switch (op) {
case LESS:
matches = compareResult < 0;
break;
case LESS_OR_EQUAL:
matches = compareResult <= 0;
break;
case EQUAL:
matches = compareResult == 0;
break;
case NOT_EQUAL:
matches = compareResult != 0;
break;
case GREATER_OR_EQUAL:
matches = compareResult >= 0;
break;
case GREATER:
matches = compareResult > 0;
break;
default:
throw new RuntimeException("Unknown Compare op " + op.name());
}
return matches;
}
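// Illustrative note (not part of HRegion): compareResult above comes from comparing the cell's
// current value against the caller-supplied ByteArrayComparable (a BinaryComparator over the
// expected value in the non-filter checkAndMutate path). A minimal standalone sketch with
// made-up bytes, assuming the comparator is used as in checkAndMutateInternal:
//
// ByteArrayComparable comparator = new BinaryComparator(Bytes.toBytes("expected"));
// byte[] storedValue = Bytes.toBytes("expected");
// int compareResult = comparator.compareTo(storedValue);
// boolean equal = matches(CompareOperator.EQUAL, compareResult); // true when the values match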
private OperationStatus mutate(Mutation mutation) throws IOException {
return mutate(mutation, false);
}
private OperationStatus mutate(Mutation mutation, boolean atomic) throws IOException {
return mutate(mutation, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
private OperationStatus mutate(Mutation mutation, boolean atomic, long nonceGroup, long nonce)
throws IOException {
OperationStatus[] status =
this.batchMutate(new Mutation[] { mutation }, atomic, nonceGroup, nonce);
if (status[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
throw new FailedSanityCheckException(status[0].getExceptionMsg());
} else if (status[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
throw new NoSuchColumnFamilyException(status[0].getExceptionMsg());
} else if (status[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) {
throw new RegionTooBusyException(status[0].getExceptionMsg());
}
return status[0];
}
/**
* Complete taking the snapshot on the region. Writes the region info and adds references to the
* working snapshot directory. TODO for api consistency, consider adding another version with no
* {@link ForeignExceptionSnare} arg. (In the future other cancellable HRegion methods could
* eventually add a {@link ForeignExceptionSnare}, or we could do something fancier).
* @param desc snapshot description object
* @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to bail
* out. This is allowed to be null and will just be ignored in that case.
* @throws IOException if there is an external or internal error causing the snapshot to fail
*/
public void addRegionToSnapshot(SnapshotDescription desc, ForeignExceptionSnare exnSnare)
throws IOException {
Path rootDir = CommonFSUtils.getRootDir(conf);
Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf);
SnapshotManifest manifest =
SnapshotManifest.create(conf, getFilesystem(), snapshotDir, desc, exnSnare);
manifest.addRegion(this);
}
private void updateSequenceId(final Iterable<List<Cell>> cellItr, final long sequenceId)
throws IOException {
for (List<Cell> cells : cellItr) {
if (cells == null) return;
for (Cell cell : cells) {
PrivateCellUtil.setSequenceId(cell, sequenceId);
}
}
}
/**
* Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP}
* with the provided current timestamp.
*/
private static void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
throws IOException {
for (List<Cell> cells : cellItr) {
if (cells == null) continue;
// Optimization: 'foreach' loop is not used. See:
// HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
assert cells instanceof RandomAccess;
int listSize = cells.size();
for (int i = 0; i < listSize; i++) {
PrivateCellUtil.updateLatestStamp(cells.get(i), now);
}
}
}
/**
* Possibly rewrite incoming cell tags.
*/
private void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
// Check if we have any work to do and early out otherwise
// Update these checks as more logic is added here
if (m.getTTL() == Long.MAX_VALUE) {
return;
}
// From this point we know we have some work to do
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
List<Cell> cells = e.getValue();
assert cells instanceof RandomAccess;
int listSize = cells.size();
for (int i = 0; i < listSize; i++) {
Cell cell = cells.get(i);
List<Tag> newTags = TagUtil.carryForwardTags(null, cell);
newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL());
// Rewrite the cell with the updated set of tags
cells.set(i, PrivateCellUtil.createCell(cell, newTags));
}
}
}
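// Illustrative sketch (not part of HRegion): the TTL tag rewrite above only runs when a client
// sets a per-mutation TTL, i.e. anything other than Long.MAX_VALUE. Values below are made up.
//
// Put put = new Put(Bytes.toBytes("row-1"));
// put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
// put.setTTL(TimeUnit.HOURS.toMillis(1)); // cells of this Put get a TTL tag carried forward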
/**
* Check whether we have the resources to support an update.
*
* Throws RegionTooBusyException if we are above the memstore limit; the client is expected to
* retry using some kind of backoff.
*/
private void checkResources() throws RegionTooBusyException {
// If catalog region, do not impose resource constraints or block updates.
if (this.getRegionInfo().isMetaRegion()) {
return;
}
MemStoreSize mss = this.memStoreSizing.getMemStoreSize();
if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) {
blockedRequestsCount.increment();
requestFlush();
// Don't print current limit because it will vary too much. The message is used as a key
// over in RetriesExhaustedWithDetailsException processing.
final String regionName =
this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName();
final String serverName = this.getRegionServerServices() == null
? "unknown"
: (this.getRegionServerServices().getServerName() == null
? "unknown"
: this.getRegionServerServices().getServerName().toString());
RegionTooBusyException rtbe = new RegionTooBusyException("Over memstore limit="
+ org.apache.hadoop.hbase.procedure2.util.StringUtils.humanSize(this.blockingMemStoreSize)
+ ", regionName=" + regionName + ", server=" + serverName);
LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe);
throw rtbe;
}
}
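// Illustrative sketch (not part of HRegion): a caller-side reaction to the RegionTooBusyException
// thrown above, retrying with a simple exponential backoff. Method name, delays and cap are
// assumptions; real clients normally rely on the HBase client's own retry/backoff machinery.
//
// void putWithBackoff(Region region, Put put) throws IOException, InterruptedException {
//   long backoffMs = 100;
//   while (true) {
//     try {
//       region.put(put);
//       return;
//     } catch (RegionTooBusyException e) {
//       Thread.sleep(backoffMs); // give the requested flush time to free memstore space
//       backoffMs = Math.min(backoffMs * 2, 10_000);
//     }
//   }
// }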
/**
* @throws IOException if the region is in read-only mode.
*/
private void checkReadOnly() throws IOException {
if (isReadOnly()) {
throw new DoNotRetryIOException("region is read only");
}
}
private void checkReadsEnabled() throws IOException {
if (!this.writestate.readsEnabled) {
throw new IOException(getRegionInfo().getEncodedName()
+ ": The region's reads are disabled. Cannot serve the request");
}
}
public void setReadsEnabled(boolean readsEnabled) {
if (readsEnabled && !this.writestate.readsEnabled) {
LOG.info("Enabling reads for {}", getRegionInfo().getEncodedName());
}
this.writestate.setReadsEnabled(readsEnabled);
}
/**
* @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be
* set; when set we will run operations that make sense in the increment/append
* scenario but that do not make sense otherwise.
* @see #applyToMemStore(HStore, Cell, MemStoreSizing)
*/
private void applyToMemStore(HStore store, List<Cell> cells, boolean delta,
MemStoreSizing memstoreAccounting) throws IOException {
// Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1;
if (upsert) {
store.upsert(cells, getSmallestReadPoint(), memstoreAccounting);
} else {
store.add(cells, memstoreAccounting);
}
}
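// Illustrative sketch (not part of HRegion): the upsert fast path above is only taken for
// increment/append deltas on a family that keeps a single version. Such a family would be
// declared roughly as follows (family name is a made-up example):
//
// ColumnFamilyDescriptor cf = ColumnFamilyDescriptorBuilder
//   .newBuilder(Bytes.toBytes("counters"))
//   .setMaxVersions(1) // maxVersions == 1 enables the upsert path for deltas
//   .build();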
/**
* @see #applyToMemStore(HStore, List, boolean, MemStoreSizing)
*/
private void applyToMemStore(HStore store, Cell cell, MemStoreSizing memstoreAccounting)
throws IOException {
// Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
if (store == null) {
checkFamily(CellUtil.cloneFamily(cell));
// Unreachable because checkFamily will throw exception
}
store.add(cell, memstoreAccounting);
}
/**
* Check the collection of families for validity.
*/
public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
for (byte[] family : families) {
checkFamily(family);
}
}
/**
* Check the collection of families for valid timestamps
* @param now current timestamp
*/
public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
throws FailedSanityCheckException {
if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
return;
}
long maxTs = now + timestampSlop;
for (List<Cell> kvs : familyMap.values()) {
// Optimization: 'foreach' loop is not used. See:
// HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
assert kvs instanceof RandomAccess;
int listSize = kvs.size();
for (int i = 0; i < listSize; i++) {
Cell cell = kvs.get(i);
// see if the user-side TS is out of range. latest = server-side
long ts = cell.getTimestamp();
if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
throw new FailedSanityCheckException(
"Timestamp for KV out of range " + cell + " (too.new=" + timestampSlop + ")");
}
}
}
}
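// Illustrative note (not part of HRegion): with a timestampSlop of, say, 2000 ms, a cell carrying
// a client-supplied timestamp more than two seconds ahead of the server clock is rejected. A
// standalone sketch of the same arithmetic, with made-up values:
//
// long now = 1_000_000L;        // server time
// long timestampSlop = 2_000L;  // configured slop
// long maxTs = now + timestampSlop;
// long cellTs = 1_003_000L;     // client-supplied timestamp
// boolean rejected = cellTs != HConstants.LATEST_TIMESTAMP && cellTs > maxTs; // true here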
/*
* @return True if size is over the flush threshold
*/
private boolean isFlushSize(MemStoreSize size) {
return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize();
}
private void deleteRecoveredEdits(FileSystem fs, Iterable<Path> files) throws IOException {
for (Path file : files) {
if (!fs.delete(file, false)) {
LOG.error("Failed delete of {}", file);
} else {
LOG.debug("Deleted recovered.edits file={}", file);
}
}
}
/**
* Read the edits put under this region by the wal splitting process. Put the recovered edits back
* into this region.
*
* We can ignore any wal message that has a sequence ID that's equal to or lower than minSeqId.
* (Because we know such messages are already reflected in the HFiles.)
*
* While this is running we are putting pressure on memory yet we are outside of our usual
* accounting because we are not yet an onlined region (this stuff is being run as part of Region
* initialization). This means that if we're up against global memory limits, we'll not be flagged
* to flush because we are not online. We can't be flushed by usual mechanisms anyway; we're not
* yet online so our relative sequenceids are not yet aligned with WAL sequenceids -- not till we
* come up online, post processing of split edits.
*
* But to help relieve memory pressure, we at least manage our own heap size flushing if we are
* in excess of per-region limits. When flushing, though, we have to be careful to avoid using
* the regionserver/wal sequenceid. It runs on a different timeline from what is going on in this
* region context, so if we crashed while replaying these edits but in the midst had a flush that
* used the regionserver wal with a sequenceid in excess of what is going on here in this region
* and its split editlogs, then we could miss edits the next time we go to recover. So we have to
* flush inline, using seqids that make sense in this single region context only -- until we are
* online.
* @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of the maxSeqId
* for the store to be applied, else it is skipped.
* @return the sequence id of the last edit added to this region out of the recovered edits log or
* minSeqId if nothing added from editlogs.
*/
long replayRecoveredEditsIfAny(Map<byte[], Long> maxSeqIdInStores,
final CancelableProgressable reporter, final MonitoredTask status) throws IOException {
long minSeqIdForTheRegion = -1;
for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
minSeqIdForTheRegion = maxSeqIdInStore;
}
}
long seqId = minSeqIdForTheRegion;
String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR);
if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) {
FileSystem walFS = getWalFileSystem();
FileSystem rootFS = getFilesystem();
Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(),
getRegionInfo().getEncodedName());
Path regionWALDir = getWALRegionDir();
Path regionDir =
FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo());
// We made a mistake in HBASE-20734 so we need to do this dirty hack...
NavigableSet<Path> filesUnderWrongRegionWALDir =
WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir);
seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS,
filesUnderWrongRegionWALDir, reporter, regionDir));
// This is to ensure backwards compatibility with HBASE-20723 where recovered edits can appear
// under the root dir even if walDir is set.
NavigableSet<Path> filesUnderRootDir = Collections.emptyNavigableSet();
if (!regionWALDir.equals(regionDir)) {
filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir);
seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS,
filesUnderRootDir, reporter, regionDir));
}
NavigableSet<Path> files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir);
seqId = Math.max(seqId,
replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir));
if (seqId > minSeqIdForTheRegion) {
// Then we added some edits to memory. Flush and cleanup split edit files.
internalFlushcache(null, seqId, stores.values(), status, false,
FlushLifeCycleTracker.DUMMY);
}
// Now delete the content of recovered edits. We're done w/ them.
if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
// For debugging data loss issues!
// If this flag is set, make use of the hfile archiving by making recovered.edits a fake
// column family. Have to fake out file type too by casting our recovered.edits as
// storefiles
String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName();
Set<HStoreFile> fakeStoreFiles = new HashSet<>(files.size());
for (Path file : files) {
fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true));
}
getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles);
} else {
deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir));
deleteRecoveredEdits(rootFS, filesUnderRootDir);
}
} else {
Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr);
FileSystem fs = recoveredEditsDir.getFileSystem(conf);
FileStatus[] files = fs.listStatus(recoveredEditsDir);
LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length,
recoveredEditsDir);
if (files != null) {
for (FileStatus file : files) {
// it is safe to trust the zero-length in this case because we've been through rename and
// lease recovery in the above.
if (isZeroLengthThenDelete(fs, file, file.getPath())) {
continue;
}
seqId =
Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs));
}
}
if (seqId > minSeqIdForTheRegion) {
// Then we added some edits to memory. Flush and cleanup split edit files.
internalFlushcache(null, seqId, stores.values(), status, false,
FlushLifeCycleTracker.DUMMY);
}
deleteRecoveredEdits(fs,
Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList()));
}
return seqId;
}
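// Illustrative sketch (not part of HRegion): the archiving branch above is opt-in and meant for
// debugging data-loss issues. The flag it reads can be set in the site configuration, e.g.:
//
// Configuration conf = HBaseConfiguration.create();
// conf.setBoolean("hbase.region.archive.recovered.edits", true); // key as read in the code above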
private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs,
final NavigableSet<Path> files, final CancelableProgressable reporter, final Path regionDir)
throws IOException {
long seqid = minSeqIdForTheRegion;
if (LOG.isDebugEnabled()) {
LOG.debug("Found " + (files == null ? 0 : files.size()) + " recovered edits file(s) under "
+ regionDir);
}
if (files == null || files.isEmpty()) {
return minSeqIdForTheRegion;
}
for (Path edits : files) {
if (edits == null || !fs.exists(edits)) {
LOG.warn("Null or non-existent edits file: " + edits);
continue;
}
if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) {
continue;
}
long maxSeqId;
String fileName = edits.getName();
maxSeqId = Math.abs(Long.parseLong(fileName));
if (maxSeqId <= minSeqIdForTheRegion) {
if (LOG.isDebugEnabled()) {
String msg = "Maximum sequenceid for this wal is " + maxSeqId
+ " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion
+ ", skipped the whole file, path=" + edits;
LOG.debug(msg);
}
continue;
}
try {
// replay the edits. Replay can return -1 if everything is skipped, only update
// if seqId is greater
seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs));
} catch (IOException e) {
handleException(fs, edits, e);
}
}
return seqid;
}
private void handleException(FileSystem fs, Path edits, IOException e) throws IOException {
boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
if (conf.get("hbase.skip.errors") != null) {
LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use "
+ HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
}
if (skipErrors) {
Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits);
LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed "
+ edits + " as " + p, e);
} else {
throw e;
}
}
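// Illustrative sketch (not part of HRegion): whether a bad recovered-edits file is moved aside or
// aborts the replay is controlled by the configuration checked above, e.g.:
//
// conf.setBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, true); // rename bad files, go on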
/**
* @param edits File of recovered edits.
* @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger
* than this to be replayed for each store.
* @return the sequence id of the last edit added to this region out of the recovered edits log or
* minSeqId if nothing added from editlogs.
*/
private long replayRecoveredEdits(final Path edits, Map<byte[], Long> maxSeqIdInStores,
final CancelableProgressable reporter, FileSystem fs) throws IOException {
String msg = "Replaying edits from " + edits;
LOG.info(msg);
MonitoredTask status = TaskMonitor.get().createStatus(msg);
status.setStatus("Opening recovered edits");
WAL.Reader reader = null;
try {
reader = WALFactory.createReader(fs, edits, conf);
long currentEditSeqId = -1;
long currentReplaySeqId = -1;
long firstSeqIdInLog = -1;
long skippedEdits = 0;
long editsCount = 0;
long intervalEdits = 0;
WAL.Entry entry;
HStore store = null;
boolean reported_once = false;
ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
try {
// How many edits seen before we check elapsed time
int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
// How often to send a progress report (default 1/2 master timeout)
int period = this.conf.getInt("hbase.hstore.report.period", 300000);
long lastReport = EnvironmentEdgeManager.currentTime();
if (coprocessorHost != null) {
coprocessorHost.preReplayWALs(this.getRegionInfo(), edits);
}
while ((entry = reader.next()) != null) {
WALKey key = entry.getKey();
WALEdit val = entry.getEdit();
if (ng != null) { // some test, or nonces disabled
ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
}
if (reporter != null) {
intervalEdits += val.size();
if (intervalEdits >= interval) {
// Number of edits interval reached
intervalEdits = 0;
long cur = EnvironmentEdgeManager.currentTime();
if (lastReport + period <= cur) {
status.setStatus(
"Replaying edits..." + " skipped=" + skippedEdits + " edits=" + editsCount);
// Timeout reached
if (!reporter.progress()) {
msg = "Progressable reporter failed, stopping replay for region " + this;
LOG.warn(msg);
status.abort(msg);
throw new IOException(msg);
}
reported_once = true;
lastReport = cur;
}
}
}
if (firstSeqIdInLog == -1) {
firstSeqIdInLog = key.getSequenceId();
}
if (currentEditSeqId > key.getSequenceId()) {
// when this condition is true, it means we have a serious defect because we need to
// maintain increasing SeqId for WAL edits per region
LOG.error(getRegionInfo().getEncodedName() + " : " + "Found decreasing SeqId. PreId="
+ currentEditSeqId + " key=" + key + "; edit=" + val);
} else {
currentEditSeqId = key.getSequenceId();
}
currentReplaySeqId =
(key.getOrigLogSeqNum() > 0) ? key.getOrigLogSeqNum() : currentEditSeqId;
// Start coprocessor replay here. The coprocessor is for each WALEdit
// instead of a KeyValue.
if (coprocessorHost != null) {
status.setStatus("Running pre-WAL-restore hook in coprocessors");
if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
// if bypass this wal entry, ignore it ...
continue;
}
}
boolean checkRowWithinBoundary = false;
// Check this edit is for this region.
if (
!Bytes.equals(key.getEncodedRegionName(), this.getRegionInfo().getEncodedNameAsBytes())
) {
checkRowWithinBoundary = true;
}
boolean flush = false;
MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing();
for (Cell cell : val.getCells()) {
// Check this edit is for me. Also, guard against writing the special
// METACOLUMN info such as HBASE::CACHEFLUSH entries
if (WALEdit.isMetaEditFamily(cell)) {
// if region names don't match, skip replaying the compaction marker
if (!checkRowWithinBoundary) {
// this is a special edit, we should handle it
CompactionDescriptor compaction = WALEdit.getCompaction(cell);
if (compaction != null) {
// replay the compaction
replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
}
}
skippedEdits++;
continue;
}
// Figure which store the edit is meant for.
if (
store == null
|| !CellUtil.matchingFamily(cell, store.getColumnFamilyDescriptor().getName())
) {
store = getStore(cell);
}
if (store == null) {
// This should never happen. Perhaps schema was changed between
// crash and redeploy?
LOG.warn("No family for cell {} in region {}", cell, this);
skippedEdits++;
continue;
}
if (
checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(), cell.getRowArray(),
cell.getRowOffset(), cell.getRowLength())
) {
LOG.warn("Row of {} is not within region boundary for region {}", cell, this);
skippedEdits++;
continue;
}
// Now, figure if we should skip this edit.
if (
key.getSequenceId()
<= maxSeqIdInStores.get(store.getColumnFamilyDescriptor().getName())
) {
skippedEdits++;
continue;
}
PrivateCellUtil.setSequenceId(cell, currentReplaySeqId);
restoreEdit(store, cell, memStoreSizing);
editsCount++;
}
MemStoreSize mss = memStoreSizing.getMemStoreSize();
incMemStoreSize(mss);
flush = isFlushSize(this.memStoreSizing.getMemStoreSize());
if (flush) {
internalFlushcache(null, currentEditSeqId, stores.values(), status, false,
FlushLifeCycleTracker.DUMMY);
}
if (coprocessorHost != null) {
coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
}
}
if (coprocessorHost != null) {
coprocessorHost.postReplayWALs(this.getRegionInfo(), edits);
}
} catch (EOFException eof) {
Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits);
msg = "EnLongAddered EOF. Most likely due to Master failure during "
+ "wal splitting, so we have this data in another edit. Continuing, but renaming " + edits
+ " as " + p + " for region " + this;
LOG.warn(msg, eof);
status.abort(msg);
} catch (IOException ioe) {
// If the IOE resulted from bad file format,
// then this problem is idempotent and retrying won't help
if (ioe.getCause() instanceof ParseException) {
Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits);
msg =
"File corruption enLongAddered! " + "Continuing, but renaming " + edits + " as " + p;
LOG.warn(msg, ioe);
status.setStatus(msg);
} else {
status.abort(StringUtils.stringifyException(ioe));
// other IO errors may be transient (bad network connection,
// checksum exception on one datanode, etc). throw & retry
throw ioe;
}
}
if (reporter != null && !reported_once) {
reporter.progress();
}
msg = "Applied " + editsCount + ", skipped " + skippedEdits + ", firstSequenceIdInLog="
+ firstSeqIdInLog + ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
status.markComplete(msg);
LOG.debug(msg);
return currentEditSeqId;
} finally {
status.cleanup();
if (reader != null) {
reader.close();
}
}
}
/**
* Call to complete a compaction. It's for the case where we find in the WAL a compaction that was
* not finished. We could find one when recovering a WAL after a regionserver crash. See HBASE-2331.
*/
void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
boolean removeFiles, long replaySeqId) throws IOException {
try {
checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
"Compaction marker from WAL ", compaction);
} catch (WrongRegionException wre) {
if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
// skip the compaction marker since it is not for this region
return;
}
throw wre;
}
synchronized (writestate) {
if (replaySeqId < lastReplayedOpenRegionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :"
+ TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId
+ " is smaller than this regions " + "lastReplayedOpenRegionSeqId of "
+ lastReplayedOpenRegionSeqId);
return;
}
if (replaySeqId < lastReplayedCompactionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :"
+ TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId
+ " is smaller than this regions " + "lastReplayedCompactionSeqId of "
+ lastReplayedCompactionSeqId);
return;
} else {
lastReplayedCompactionSeqId = replaySeqId;
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying compaction marker "
+ TextFormat.shortDebugString(compaction) + " with seqId=" + replaySeqId
+ " and lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
}
startRegionOperation(Operation.REPLAY_EVENT);
try {
HStore store = this.getStore(compaction.getFamilyName().toByteArray());
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Found Compaction WAL edit for deleted family:"
+ Bytes.toString(compaction.getFamilyName().toByteArray()));
return;
}
store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
logRegionFiles();
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "At least one of the store files in compaction: "
+ TextFormat.shortDebugString(compaction)
+ " doesn't exist any more. Skip loading the file(s)", ex);
} finally {
closeRegionOperation(Operation.REPLAY_EVENT);
}
}
}
void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
checkTargetRegion(flush.getEncodedRegionName().toByteArray(), "Flush marker from WAL ", flush);
if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return; // if primary nothing to do
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying flush marker "
+ TextFormat.shortDebugString(flush));
}
startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
try {
FlushAction action = flush.getAction();
switch (action) {
case START_FLUSH:
replayWALFlushStartMarker(flush);
break;
case COMMIT_FLUSH:
replayWALFlushCommitMarker(flush);
break;
case ABORT_FLUSH:
replayWALFlushAbortMarker(flush);
break;
case CANNOT_FLUSH:
replayWALFlushCannotFlushMarker(flush, replaySeqId);
break;
default:
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush event with unknown action, ignoring. "
+ TextFormat.shortDebugString(flush));
break;
}
logRegionFiles();
} finally {
closeRegionOperation(Operation.REPLAY_EVENT);
}
}
/**
* Replay the flush marker from primary region by creating a corresponding snapshot of the store
* memstores, only if the memstores do not have a higher seqId from an earlier wal edit (because
* the events may be coming out of order).
*/
PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
long flushSeqId = flush.getFlushSequenceNumber();
HashSet<HStore> storesToFlush = new HashSet<>();
for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
byte[] family = storeFlush.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush start marker from primary, but the family is not found. Ignoring"
+ " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
continue;
}
storesToFlush.add(store);
}
MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
// we will use writestate as a coarse-grain lock for all the replay events
// (flush, compaction, region open etc)
synchronized (writestate) {
try {
if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :"
+ TextFormat.shortDebugString(flush)
+ " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
+ " of " + lastReplayedOpenRegionSeqId);
return null;
}
if (numMutationsWithoutWAL.sum() > 0) {
numMutationsWithoutWAL.reset();
dataInMemoryWithoutWAL.reset();
}
if (!writestate.flushing) {
// we do not have an active snapshot and a corresponding this.prepareResult. This means
// we can just snapshot our memstores and continue as normal.
// invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal
PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId,
storesToFlush, status, false, FlushLifeCycleTracker.DUMMY);
if (prepareResult.result == null) {
// save the PrepareFlushResult so that we can use it later from commit flush
this.writestate.flushing = true;
this.prepareFlushResult = prepareResult;
status.markComplete("Flush prepare successful");
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared flush with seqId:"
+ flush.getFlushSequenceNumber());
}
} else {
// special case empty memstore. We will still save the flush result in this case, since
// our memstore is empty, but the primary is still flushing
if (
prepareResult.getResult().getResult()
== FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY
) {
this.writestate.flushing = true;
this.prepareFlushResult = prepareResult;
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
}
}
status.abort("Flush prepare failed with " + prepareResult.result);
// nothing much to do. prepare flush failed because of some reason.
}
return prepareResult;
} else {
// we already have an active snapshot.
if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
// They define the same flush. Log and continue.
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush prepare marker with the same seqId: "
+ +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Ignoring");
// ignore
} else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
// We received a flush with a smaller seqNum than what we have prepared. We can only
// ignore this prepare flush request.
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush prepare marker with a smaller seqId: "
+ +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Ignoring");
// ignore
} else {
// We received a flush with a larger seqNum than what we have prepared
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush prepare marker with a larger seqId: "
+ +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Ignoring");
// We do not have multiple active snapshots in the memstore or a way to merge current
// memstore snapshot with the contents and resnapshot for now. We cannot take
// another snapshot and drop the previous one because that will cause temporary
// data loss in the secondary. So we ignore this for now, deferring the resolution
// to happen when we see the corresponding flush commit marker. If we have a memstore
// snapshot with x, and later received another prepare snapshot with y (where x < y),
// when we see flush commit for y, we will drop snapshot for x, and can also drop all
// the memstore edits if everything in memstore is < y. This is the usual case for
// RS crash + recovery where we might see consecutive prepare flush wal markers.
// Otherwise, this will cause more memory to be used in the secondary replica until a
// further prepare + commit flush is seen and replayed.
}
}
} finally {
status.cleanup();
writestate.notifyAll();
}
}
return null;
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
justification = "Intentional; post memstore flush")
void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
// check whether we have the memstore snapshot with the corresponding seqId. Replays to
// secondary region replicas are in order, except for when the region moves or the
// region server crashes. In those cases, we may receive replay requests out of order from
// the original seqIds.
synchronized (writestate) {
try {
if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :"
+ TextFormat.shortDebugString(flush)
+ " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
+ " of " + lastReplayedOpenRegionSeqId);
return;
}
if (writestate.flushing) {
PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
+ " and a previous prepared snapshot was found");
}
// This is the regular case where we received commit flush after prepare flush
// corresponding to the same seqId.
replayFlushInStores(flush, prepareFlushResult, true);
// Set down the memstore size by amount of flush.
this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize());
this.prepareFlushResult = null;
writestate.flushing = false;
} else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
// This should not happen normally. However, let's be safe and guard against these cases
// we received a flush commit with a smaller seqId than what we have prepared
// we will pick the flush file up from this commit (if we have not seen it), but we
// will not drop the memstore
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with smaller seqId: "
+ flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
+ " prepared memstore snapshot");
replayFlushInStores(flush, prepareFlushResult, false);
// snapshot is not dropped, so memstore sizes should not be decremented
// we still have the prepared snapshot, flushing should still be true
} else {
// This should not happen normally. However, let's be safe and guard against these cases
// we received a flush commit with a larger seqId than what we have prepared
// we will pick the flush file for this. We will also obtain the updates lock and
// look for contents of the memstore to see whether we have edits after this seqId.
// If not, we will drop all the memstore edits and the snapshot as well.
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker with larger seqId: "
+ flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
+ prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
+ " memstore snapshot");
replayFlushInStores(flush, prepareFlushResult, true);
// Set down the memstore size by amount of flush.
this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize());
// Inspect the memstore contents to see whether the memstore contains only edits
// with seqId smaller than the flush seqId. If so, we can discard those edits.
dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
this.prepareFlushResult = null;
writestate.flushing = false;
}
// If we were waiting for observing a flush or region opening event for not showing
// partial data after a secondary region crash, we can allow reads now. We can only make
// sure that we are not showing partial data (for example skipping some previous edits)
// until we observe a full flush start and flush commit. So if we were not able to find
// a previous flush we will not enable reads now.
this.setReadsEnabled(true);
} else {
LOG.warn(
getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:"
+ flush.getFlushSequenceNumber() + ", but no previous prepared snapshot was found");
// There is no corresponding prepare snapshot from before.
// We will pick up the new flushed file
replayFlushInStores(flush, null, false);
// Inspect the memstore contents to see whether the memstore contains only edits
// with seqId smaller than the flush seqId. If so, we can discard those edits.
dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
}
status.markComplete("Flush commit successful");
// Update the last flushed sequence id for region.
this.maxFlushedSeqId = flush.getFlushSequenceNumber();
// advance the mvcc read point so that the new flushed file is visible.
mvcc.advanceTo(flush.getFlushSequenceNumber());
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
+ " doesn't exist any more. Skip loading the file(s)", ex);
} finally {
status.cleanup();
writestate.notifyAll();
}
}
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
}
/**
* Replays the given flush descriptor by opening the flush files in stores and dropping the
* memstore snapshots if requested.
*/
private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
boolean dropMemstoreSnapshot) throws IOException {
for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
byte[] family = storeFlush.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a flush commit marker from primary, but the family is not found."
+ "Ignoring StoreFlushDescriptor:" + storeFlush);
continue;
}
List<String> flushFiles = storeFlush.getFlushOutputList();
StoreFlushContext ctx = null;
long startTime = EnvironmentEdgeManager.currentTime();
if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY);
} else {
ctx = prepareFlushResult.storeFlushCtxs.get(family);
startTime = prepareFlushResult.startTime;
}
if (ctx == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Unexpected: flush commit marker received from store " + Bytes.toString(family)
+ " but no associated flush context. Ignoring");
continue;
}
ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
// Record latest flush time
this.lastStoreFlushTimeMap.put(store, startTime);
}
}
private long loadRecoveredHFilesIfAny(Collection<HStore> stores) throws IOException {
Path regionDir = fs.getRegionDir();
long maxSeqId = -1;
for (HStore store : stores) {
String familyName = store.getColumnFamilyName();
FileStatus[] files =
WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName);
if (files != null && files.length != 0) {
for (FileStatus file : files) {
Path filePath = file.getPath();
// If file length is zero then delete it
if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) {
continue;
}
try {
HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath());
maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID());
} catch (IOException e) {
handleException(fs.getFileSystem(), filePath, e);
continue;
}
}
if (this.rsServices != null && store.needsCompaction()) {
this.rsServices.getCompactionRequestor().requestCompaction(this, store,
"load recovered hfiles request compaction", Store.PRIORITY_USER + 1,
CompactionLifeCycleTracker.DUMMY, null);
}
}
}
return maxSeqId;
}
/**
* Be careful, this method will drop all data in the memstore of this region. Currently, this
* method is used to drop memstore to prevent memory leak when replaying recovered.edits while
* opening region.
*/
private MemStoreSize dropMemStoreContents() throws IOException {
MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing();
this.updatesLock.writeLock().lock();
try {
for (HStore s : stores.values()) {
MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM);
LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region "
+ this.getRegionInfo().getRegionNameAsString() + " , dropped memstoresize: ["
+ memStoreSize + " }");
totalFreedSize.incMemStoreSize(memStoreSize);
}
return totalFreedSize.getMemStoreSize();
} finally {
this.updatesLock.writeLock().unlock();
}
}
/**
* Drops the memstore contents after replaying a flush descriptor or region open event replay if
* the memstore edits have seqNums smaller than the given seq id
*/
private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException {
MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing();
this.updatesLock.writeLock().lock();
try {
long currentSeqId = mvcc.getReadPoint();
if (seqId >= currentSeqId) {
// then we can drop the memstore contents since everything is below this seqId
LOG.info(getRegionInfo().getEncodedName() + " : "
+ "Dropping memstore contents as well since replayed flush seqId: " + seqId
+ " is greater than current seqId:" + currentSeqId);
// Prepare flush (take a snapshot) and then abort (drop the snapshot)
if (store == null) {
for (HStore s : stores.values()) {
totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId));
}
} else {
totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId));
}
} else {
LOG.info(getRegionInfo().getEncodedName() + " : "
+ "Not dropping memstore contents since replayed flush seqId: " + seqId
+ " is smaller than current seqId:" + currentSeqId);
}
} finally {
this.updatesLock.writeLock().unlock();
}
return totalFreedSize.getMemStoreSize();
}
private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId)
throws IOException {
MemStoreSize flushableSize = s.getFlushableSize();
this.decrMemStoreSize(flushableSize);
StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY);
ctx.prepare();
ctx.abort();
return flushableSize;
}
private void replayWALFlushAbortMarker(FlushDescriptor flush) {
// nothing to do for now. A flush abort will cause a RS abort which means that the region
// will be opened somewhere else later. We will see the region open event soon, and replaying
// that will drop the snapshot
}
private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
synchronized (writestate) {
if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :"
+ TextFormat.shortDebugString(flush) + " because its sequence id " + replaySeqId
+ " is smaller than this regions " + "lastReplayedOpenRegionSeqId of "
+ lastReplayedOpenRegionSeqId);
return;
}
// If we were waiting for observing a flush or region opening event for not showing partial
// data after a secondary region crash, we can allow reads now. This event means that the
// primary was not able to flush because memstore is empty when we requested flush. By the
// time we observe this, we are guaranteed to have up to date seqId with our previous
// assignment.
this.setReadsEnabled(true);
}
}
PrepareFlushResult getPrepareFlushResult() {
return prepareFlushResult;
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
justification = "Intentional; cleared the memstore")
void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
"RegionEvent marker from WAL ", regionEvent);
startRegionOperation(Operation.REPLAY_EVENT);
try {
if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return; // if primary nothing to do
}
if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
// nothing to do on REGION_CLOSE for now.
return;
}
if (regionEvent.getEventType() != EventType.REGION_OPEN) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Unknown region event received, ignoring :" + TextFormat.shortDebugString(regionEvent));
return;
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying region open event marker "
+ TextFormat.shortDebugString(regionEvent));
}
// we will use writestate as a coarse-grain lock for all the replay events
synchronized (writestate) {
// Replication can deliver events out of order when primary region moves or the region
// server crashes, since there is no coordination between replication of different wal files
// belonging to different region servers. We have to safeguard against this case by using
// region open event's seqid. Since this is the first event that the region puts (after
// possibly flushing recovered.edits), after seeing this event, we can ignore every edit
// smaller than this seqId
if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
} else {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying region event :"
+ TextFormat.shortDebugString(regionEvent)
+ " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
+ " of " + lastReplayedOpenRegionSeqId);
return;
}
// region open lists all the files that the region has at the time of the opening. Just pick
// all the files and drop prepared flushes and empty memstores
for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
// stores of primary may be different now
byte[] family = storeDescriptor.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a region open marker from primary, but the family is not found. "
+ "Ignoring. StoreDescriptor:" + storeDescriptor);
continue;
}
long storeSeqId = store.getMaxSequenceId().orElse(0L);
List<String> storeFiles = storeDescriptor.getStoreFileList();
try {
store.refreshStoreFiles(storeFiles); // replace the files with the new ones
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files: "
+ storeFiles + " doesn't exist any more. Skip loading the file(s)", ex);
continue;
}
if (store.getMaxSequenceId().orElse(0L) != storeSeqId) {
// Record latest flush time if we picked up new files
lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
}
if (writestate.flushing) {
// only drop memstore snapshots if they are smaller than last flush for the store
if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null
? null
: this.prepareFlushResult.storeFlushCtxs.get(family);
if (ctx != null) {
MemStoreSize mss = store.getFlushableSize();
ctx.abort();
this.decrMemStoreSize(mss);
this.prepareFlushResult.storeFlushCtxs.remove(family);
}
}
}
// Drop the memstore contents if they are now smaller than the latest seen flushed file
dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
if (storeSeqId > this.maxFlushedSeqId) {
this.maxFlushedSeqId = storeSeqId;
}
}
// if all stores ended up dropping their snapshots, we can safely drop the
// prepareFlushResult
dropPrepareFlushIfPossible();
// advance the mvcc read point so that the new flushed file is visible.
mvcc.await();
// If we were waiting for observing a flush or region opening event for not showing partial
// data after a secondary region crash, we can allow reads now.
this.setReadsEnabled(true);
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
}
logRegionFiles();
} finally {
closeRegionOperation(Operation.REPLAY_EVENT);
}
}
void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
"BulkLoad marker from WAL ", bulkLoadEvent);
if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return; // if primary nothing to do
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying bulkload event marker "
+ TextFormat.shortDebugString(bulkLoadEvent));
}
// check if multiple families involved
boolean multipleFamilies = false;
byte[] family = null;
for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
byte[] fam = storeDescriptor.getFamilyName().toByteArray();
if (family == null) {
family = fam;
} else if (!Bytes.equals(family, fam)) {
multipleFamilies = true;
break;
}
}
startBulkRegionOperation(multipleFamilies);
try {
// we will use writestate as a coarse-grain lock for all the replay events
synchronized (writestate) {
// Replication can deliver events out of order when primary region moves or the region
// server crashes, since there is no coordination between replication of different wal files
// belonging to different region servers. We have to safeguard against this case by using
// region open event's seqid. Since this is the first event that the region puts (after
// possibly flushing recovered.edits), after seeing this event, we can ignore every edit
// smaller than this seqId
if (
bulkLoadEvent.getBulkloadSeqNum() >= 0
&& this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()
) {
LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying bulkload event :"
+ TextFormat.shortDebugString(bulkLoadEvent)
+ " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
+ " =" + lastReplayedOpenRegionSeqId);
return;
}
for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
// stores of primary may be different now
family = storeDescriptor.getFamilyName().toByteArray();
HStore store = getStore(family);
if (store == null) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ "Received a bulk load marker from primary, but the family is not found. "
+ "Ignoring. StoreDescriptor:" + storeDescriptor);
continue;
}
List<String> storeFiles = storeDescriptor.getStoreFileList();
for (String storeFile : storeFiles) {
StoreFileInfo storeFileInfo = null;
try {
storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
store.bulkLoadHFile(storeFileInfo);
} catch (FileNotFoundException ex) {
LOG.warn(getRegionInfo().getEncodedName() + " : "
+ ((storeFileInfo != null)
? storeFileInfo.toString()
: (new Path(Bytes.toString(family), storeFile)).toString())
+ " doesn't exist any more. Skip loading the file");
}
}
}
}
if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
}
} finally {
closeBulkRegionOperation();
}
}
/**
* If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
*/
private void dropPrepareFlushIfPossible() {
if (writestate.flushing) {
boolean canDrop = true;
if (prepareFlushResult.storeFlushCtxs != null) {
for (Entry<byte[], StoreFlushContext> entry : prepareFlushResult.storeFlushCtxs
.entrySet()) {
HStore store = getStore(entry.getKey());
if (store == null) {
continue;
}
if (store.getSnapshotSize().getDataSize() > 0) {
canDrop = false;
break;
}
}
}
// this means that all the stores in the region have finished flushing, but the WAL marker
// may not have been written or we did not receive it yet.
if (canDrop) {
writestate.flushing = false;
this.prepareFlushResult = null;
}
}
}
@Override
public boolean refreshStoreFiles() throws IOException {
return refreshStoreFiles(false);
}
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
justification = "Notify is about post replay. Intentional")
protected boolean refreshStoreFiles(boolean force) throws IOException {
if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
return false; // if primary nothing to do
}
if (LOG.isDebugEnabled()) {
LOG.debug(getRegionInfo().getEncodedName() + " : "
+ "Refreshing store files to see whether we can free up memstore");
}
long totalFreedDataSize = 0;
long smallestSeqIdInStores = Long.MAX_VALUE;
startRegionOperation(); // obtain region close lock
try {
Map<HStore, Long> map = new HashMap<>();
synchronized (writestate) {
for (HStore store : stores.values()) {
// TODO: some stores might see new data from flush, while others do not which
// MIGHT break atomic edits across column families.
long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L);
// refresh the store files. This is similar to observing a region open wal marker.
store.refreshStoreFiles();
long storeSeqId = store.getMaxSequenceId().orElse(0L);
if (storeSeqId < smallestSeqIdInStores) {
smallestSeqIdInStores = storeSeqId;
}
// see whether we can drop the memstore or the snapshot
if (storeSeqId > maxSeqIdBefore) {
if (writestate.flushing) {
// only drop memstore snapshots if they are smaller than last flush for the store
if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null
? null
: this.prepareFlushResult.storeFlushCtxs
.get(store.getColumnFamilyDescriptor().getName());
if (ctx != null) {
MemStoreSize mss = store.getFlushableSize();
ctx.abort();
this.decrMemStoreSize(mss);
this.prepareFlushResult.storeFlushCtxs
.remove(store.getColumnFamilyDescriptor().getName());
totalFreedDataSize += mss.getDataSize();
}
}
}
map.put(store, storeSeqId);
}
}
// if all stores ended up dropping their snapshots, we can safely drop the
// prepareFlushResult
dropPrepareFlushIfPossible();
// advance the mvcc read point so that the new flushed files are visible.
// either greater than flush seq number or they were already picked up via flush.
for (HStore s : stores.values()) {
mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L));
}
// smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
// skip all edits that are to be replayed in the future that have a smaller seqId
// than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
// that we have picked the flush files for
if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
}
}
if (!map.isEmpty()) {
for (Map.Entry entry : map.entrySet()) {
// Drop the memstore contents if they are now smaller than the latest seen flushed file
totalFreedDataSize +=
dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey()).getDataSize();
}
}
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
return totalFreedDataSize > 0;
} finally {
closeRegionOperation();
}
}
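// Hypothetical caller-side sketch (not part of this class): a secondary region replica can pick
// up files flushed or compacted by the primary via refreshStoreFiles(); the boolean result only
// says whether any memstore/snapshot data could be freed as a consequence.
//
//   boolean freedMemory = region.refreshStoreFiles();
//   if (freedMemory) {
//     LOG.debug("Refreshing store files released memstore memory for {}",
//       region.getRegionInfo().getEncodedName());
//   }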
private void logRegionFiles() {
if (LOG.isTraceEnabled()) {
LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
stores.values().stream().filter(s -> s.getStorefiles() != null)
.flatMap(s -> s.getStorefiles().stream())
.forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf));
}
}
/**
* Checks whether the given regionName either matches this region, or is the primary region for
* the range served by this secondary replica.
*/
private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
throws WrongRegionException {
if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
return;
}
if (
!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())
&& Bytes.equals(encodedRegionName, this.fs.getRegionInfoForFS().getEncodedNameAsBytes())
) {
return;
}
throw new WrongRegionException(
exceptionMsg + payload + " targetted for region " + Bytes.toStringBinary(encodedRegionName)
+ " does not match this region: " + this.getRegionInfo());
}
/**
* Used by tests
* @param s Store to add edit to.
* @param cell Cell to add.
*/
protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) {
s.add(cell, memstoreAccounting);
}
/**
* Make sure we have been through lease recovery before getting the file status, so the file
* length can be trusted.
* @param p File to check.
* @return True if file was zero-length (and if so, we'll delete it in here).
*/
private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat,
final Path p) throws IOException {
if (stat.getLen() > 0) {
return false;
}
LOG.warn("File " + p + " is zero-length, deleting.");
fs.delete(p, false);
return true;
}
protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup)
throws IOException {
if (family.isMobEnabled()) {
if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS
+ " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY
+ " accordingly.");
}
return new HMobStore(this, family, this.conf, warmup);
}
return new HStore(this, family, this.conf, warmup);
}
@Override
public HStore getStore(byte[] column) {
return this.stores.get(column);
}
/**
* Return HStore instance. Does not do any copy: as the number of stores is limited, we iterate on
* the list.
*/
private HStore getStore(Cell cell) {
return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey()))
.map(e -> e.getValue()).findFirst().orElse(null);
}
@Override
public List<HStore> getStores() {
return new ArrayList<>(stores.values());
}
@Override
public List<String> getStoreFileList(byte[][] columns) throws IllegalArgumentException {
List<String> storeFileNames = new ArrayList<>();
synchronized (closeLock) {
for (byte[] column : columns) {
HStore store = this.stores.get(column);
if (store == null) {
throw new IllegalArgumentException(
"No column family : " + new String(column, StandardCharsets.UTF_8) + " available");
}
Collection<HStoreFile> storeFiles = store.getStorefiles();
if (storeFiles == null) {
continue;
}
for (HStoreFile storeFile : storeFiles) {
storeFileNames.add(storeFile.getPath().toString());
}
logRegionFiles();
}
}
return storeFileNames;
}
//////////////////////////////////////////////////////////////////////////////
// Support code
//////////////////////////////////////////////////////////////////////////////
/** Make sure this is a valid row for the HRegion */
void checkRow(byte[] row, String op) throws IOException {
if (!rowIsInRange(getRegionInfo(), row)) {
throw new WrongRegionException("Requested row out of range for " + op + " on HRegion " + this
+ ", startKey='" + Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='"
+ Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" + Bytes.toStringBinary(row)
+ "'");
}
}
/**
* Get an exclusive (write) lock on a given row.
* @param row Which row to lock.
* @return A locked RowLock. The lock is exclusive and already acquired.
*/
public RowLock getRowLock(byte[] row) throws IOException {
return getRowLock(row, false);
}
@Override
public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
checkRow(row, "row lock");
return getRowLock(row, readLock, null);
}
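// Hypothetical usage sketch (caller code, not part of this class): acquire a shared (read) row
// lock, do the work, and always release it in a finally block so the RowLockContext is cleaned up.
//
//   RowLock rowLock = region.getRowLock(row, true /* readLock */);
//   try {
//     // read or check the row while holding the shared lock
//   } finally {
//     rowLock.release();
//   }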
Span createRegionSpan(String name) {
return TraceUtil.createSpan(name).setAttribute(REGION_NAMES_KEY,
Collections.singletonList(getRegionInfo().getRegionNameAsString()));
}
// will be overridden in tests
protected RowLock getRowLockInternal(byte[] row, boolean readLock, RowLock prevRowLock)
throws IOException {
// create an object to use as a key in the row lock map
HashedBytes rowKey = new HashedBytes(row);
RowLockContext rowLockContext = null;
RowLockImpl result = null;
boolean success = false;
try {
// Keep trying until we have a lock or error out.
// TODO: do we need to add a time component here?
while (result == null) {
rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey));
// Now try and get the lock. This can fail if the row lock context was cleaned up
// concurrently (no longer usable), in which case newReadLock/newWriteLock returns null
// and we loop around to retry with a fresh context.
if (readLock) {
// For read lock, if the caller has locked the same row previously, it will not try
// to acquire the same read lock. It simply returns the previous row lock.
RowLockImpl prevRowLockImpl = (RowLockImpl) prevRowLock;
if (
(prevRowLockImpl != null)
&& (prevRowLockImpl.getLock() == rowLockContext.readWriteLock.readLock())
) {
success = true;
return prevRowLock;
}
result = rowLockContext.newReadLock();
} else {
result = rowLockContext.newWriteLock();
}
}
int timeout = rowLockWaitDuration;
boolean reachDeadlineFirst = false;
Optional<RpcCall> call = RpcServer.getCurrentCall();
if (call.isPresent()) {
long deadline = call.get().getDeadline();
if (deadline < Long.MAX_VALUE) {
int timeToDeadline = (int) (deadline - EnvironmentEdgeManager.currentTime());
if (timeToDeadline <= this.rowLockWaitDuration) {
reachDeadlineFirst = true;
timeout = timeToDeadline;
}
}
}
if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) {
String message = "Timed out waiting for lock for row: " + rowKey + " in region "
+ getRegionInfo().getEncodedName();
if (reachDeadlineFirst) {
throw new TimeoutIOException(message);
} else {
// If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request.
throw new IOException(message);
}
}
rowLockContext.setThreadName(Thread.currentThread().getName());
success = true;
return result;
} catch (InterruptedException ie) {
if (LOG.isDebugEnabled()) {
LOG.debug("Thread interrupted waiting for lock on row: {}, in region {}", rowKey,
getRegionInfo().getRegionNameAsString());
}
throw throwOnInterrupt(ie);
} catch (Error error) {
// The maximum hold count for a read lock is 64K (hardcoded); when that maximum is
// reached, acquiring the lock throws an Error. Catch it here and convert it to an
// IOException so the caller can go ahead and process the minibatch with the locks
// that were acquired.
LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row),
getRegionInfo().getRegionNameAsString(), error);
IOException ioe = new IOException(error);
throw ioe;
} finally {
// Clean up the counts just in case this was the thing keeping the context alive.
if (!success && rowLockContext != null) {
rowLockContext.cleanUp();
}
}
}
private RowLock getRowLock(byte[] row, boolean readLock, final RowLock prevRowLock)
throws IOException {
return TraceUtil.trace(() -> getRowLockInternal(row, readLock, prevRowLock),
() -> createRegionSpan("Region.getRowLock").setAttribute(ROW_LOCK_READ_LOCK_KEY, readLock));
}
private void releaseRowLocks(List<RowLock> rowLocks) {
if (rowLocks != null) {
for (RowLock rowLock : rowLocks) {
rowLock.release();
}
rowLocks.clear();
}
}
public int getReadLockCount() {
return lock.getReadLockCount();
}
public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
return lockedRows;
}
class RowLockContext {
private final HashedBytes row;
final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true);
final AtomicBoolean usable = new AtomicBoolean(true);
final AtomicInteger count = new AtomicInteger(0);
final Object lock = new Object();
private String threadName;
RowLockContext(HashedBytes row) {
this.row = row;
}
RowLockImpl newWriteLock() {
Lock l = readWriteLock.writeLock();
return getRowLock(l);
}
RowLockImpl newReadLock() {
Lock l = readWriteLock.readLock();
return getRowLock(l);
}
private RowLockImpl getRowLock(Lock l) {
count.incrementAndGet();
synchronized (lock) {
if (usable.get()) {
return new RowLockImpl(this, l);
} else {
return null;
}
}
}
void cleanUp() {
long c = count.decrementAndGet();
if (c <= 0) {
synchronized (lock) {
if (count.get() <= 0 && usable.get()) { // Don't attempt to remove row if already removed
usable.set(false);
RowLockContext removed = lockedRows.remove(row);
assert removed == this : "we should never remove a different context";
}
}
}
}
public void setThreadName(String threadName) {
this.threadName = threadName;
}
@Override
public String toString() {
return "RowLockContext{" + "row=" + row + ", readWriteLock=" + readWriteLock + ", count="
+ count + ", threadName=" + threadName + '}';
}
}
/**
* Class used to represent a lock on a row.
*/
public static class RowLockImpl implements RowLock {
private final RowLockContext context;
private final Lock lock;
public RowLockImpl(RowLockContext context, Lock lock) {
this.context = context;
this.lock = lock;
}
public Lock getLock() {
return lock;
}
public RowLockContext getContext() {
return context;
}
@Override
public void release() {
lock.unlock();
context.cleanUp();
}
@Override
public String toString() {
return "RowLockImpl{" + "context=" + context + ", lock=" + lock + '}';
}
}
/**
* Determines whether multiple column families are present. Precondition: familyPaths is not null
* @param familyPaths List of (column family, hfilePath)
*/
private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
boolean multipleFamilies = false;
byte[] family = null;
for (Pair<byte[], String> pair : familyPaths) {
byte[] fam = pair.getFirst();
if (family == null) {
family = fam;
} else if (!Bytes.equals(family, fam)) {
multipleFamilies = true;
break;
}
}
return multipleFamilies;
}
/**
* Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple
* column families atomically.
* @param familyPaths List of Pair<byte[] column family, String hfilePath>
* @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be
* bulk loaded
* @return Map from family to List of store file paths if successful, null if failed recoverably
* @throws IOException if failed unrecoverably.
*/
public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException {
return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false, null, true);
}
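// Hypothetical usage sketch (assumes pre-built HFiles are already on the region's filesystem;
// family names and paths are illustrative): build the (family, hfile path) pairs and bulk load
// them atomically.
//
//   List<Pair<byte[], String>> familyPaths = new ArrayList<>();
//   familyPaths.add(new Pair<>(Bytes.toBytes("cf1"), "/staging/cf1/hfile1"));
//   familyPaths.add(new Pair<>(Bytes.toBytes("cf2"), "/staging/cf2/hfile2"));
//   Map<byte[], List<Path>> loaded = region.bulkLoadHFiles(familyPaths, true, null);
//   if (loaded == null) {
//     // recoverable failure (e.g. a concurrent split); caller may re-split the HFiles and retry
//   }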
/**
* Listener class to enable callers of bulkLoadHFile() to perform any necessary pre/post
* processing of a given bulkload call
*/
public interface BulkLoadListener {
/**
* Called before an HFile is actually loaded
* @param family family being loaded to
* @param srcPath path of HFile
* @return final path to be used for actual loading
*/
String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile, String customStaging)
throws IOException;
/**
* Called after a successful HFile load
* @param family family being loaded to
* @param srcPath path of HFile
*/
void doneBulkLoad(byte[] family, String srcPath) throws IOException;
/**
* Called after a failed HFile load
* @param family family being loaded to
* @param srcPath path of HFile
*/
void failedBulkLoad(byte[] family, String srcPath) throws IOException;
}
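// Minimal illustrative BulkLoadListener (a no-op sketch, not shipped with HBase): real
// implementations typically stage, rename or clean up files around the load.
//
//   BulkLoadListener listener = new BulkLoadListener() {
//     @Override
//     public String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile,
//       String customStaging) {
//       return srcPath; // load the file from where it already is
//     }
//     @Override
//     public void doneBulkLoad(byte[] family, String srcPath) {}
//     @Override
//     public void failedBulkLoad(byte[] family, String srcPath) {}
//   };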
/**
* Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple
* column families atomically.
* @param familyPaths List of Pair<byte[] column family, String hfilePath>
* @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be
* bulk loaded
* @param copyFile always copy hfiles if true
* @param clusterIds ids from clusters that had already handled the given bulkload event.
* @return Map from family to List of store file paths if successful, null if failed recoverably
* @throws IOException if failed unrecoverably.
*/
public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile,
List<String> clusterIds, boolean replicate) throws IOException {
long seqId = -1;
Map<byte[], List<Path>> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
Map<String, Long> storeFilesSizes = new HashMap<>();
Preconditions.checkNotNull(familyPaths);
// we need writeLock for multi-family bulk load
startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
boolean isSuccessful = false;
try {
this.writeRequestsCount.increment();
// There possibly was a split that happened between when the split keys
// were gathered and before the HRegion's write lock was taken. We need
// to validate the HFile region before attempting to bulk load all of them
IOException ioException = null;
List<Pair<byte[], String>> failures = new ArrayList<>();
for (Pair<byte[], String> p : familyPaths) {
byte[] familyName = p.getFirst();
String path = p.getSecond();
HStore store = getStore(familyName);
if (store == null) {
ioException = new org.apache.hadoop.hbase.DoNotRetryIOException(
"No such column family " + Bytes.toStringBinary(familyName));
} else {
try {
store.assertBulkLoadHFileOk(new Path(path));
} catch (WrongRegionException wre) {
// recoverable (file doesn't fit in region)
failures.add(p);
} catch (IOException ioe) {
// unrecoverable (hdfs problem)
ioException = ioe;
}
}
// validation failed because of some sort of IO problem.
if (ioException != null) {
LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this,
ioException);
throw ioException;
}
}
// validation failed, bail out before doing anything permanent.
if (failures.size() != 0) {
StringBuilder list = new StringBuilder();
for (Pair<byte[], String> p : failures) {
list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
.append(p.getSecond());
}
// problem when validating
LOG.warn("There was a recoverable bulk load failure likely due to a split. These (family,"
+ " HFile) pairs were not loaded: {}, in region {}", list.toString(), this);
return null;
}
// We need to assign a sequential ID that's in between two memstores in order to preserve
// the guarantee that all the edits lower than the highest sequential ID from all the
// HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is
// guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
// a sequence id that we can be sure is beyond the last hfile written).
if (assignSeqId) {
FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY);
if (fs.isFlushSucceeded()) {
seqId = ((FlushResultImpl) fs).flushSequenceId;
} else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
seqId = ((FlushResultImpl) fs).flushSequenceId;
} else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) {
// CANNOT_FLUSH may mean that a flush is already on-going
// we need to wait for that flush to complete
waitForFlushes();
} else {
throw new IOException("Could not bulk load with an assigned sequential ID because the "
+ "flush didn't run. Reason for not flushing: " + ((FlushResultImpl) fs).failureReason);
}
}
Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath =
new TreeMap<>(Bytes.BYTES_COMPARATOR);
for (Pair<byte[], String> p : familyPaths) {
byte[] familyName = p.getFirst();
String path = p.getSecond();
HStore store = getStore(familyName);
if (!familyWithFinalPath.containsKey(familyName)) {
familyWithFinalPath.put(familyName, new ArrayList<>());
}
List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName);
String finalPath = path;
try {
boolean reqTmp = store.storeEngine.requireWritingToTmpDirFirst();
if (bulkLoadListener != null) {
finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile,
reqTmp ? null : fs.getRegionDir().toString());
}
Pair<Path, Path> pair = null;
if (reqTmp || !StoreFileInfo.isHFile(finalPath)) {
pair = store.preBulkLoadHFile(finalPath, seqId);
} else {
Path livePath = new Path(finalPath);
pair = new Pair<>(livePath, livePath);
}
lst.add(pair);
} catch (IOException ioe) {
// A failure here can cause an atomicity violation that we currently
// cannot recover from since it is likely a failed HDFS operation.
LOG.error("There was a partial failure due to IO when attempting to" + " load "
+ Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
if (bulkLoadListener != null) {
try {
bulkLoadListener.failedBulkLoad(familyName, finalPath);
} catch (Exception ex) {
LOG.error("Error while calling failedBulkLoad for family "
+ Bytes.toString(familyName) + " with path " + path, ex);
}
}
throw ioe;
}
}
if (this.getCoprocessorHost() != null) {
for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue());
}
}
for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
byte[] familyName = entry.getKey();
for (Pair<Path, Path> p : entry.getValue()) {
String path = p.getFirst().toString();
Path commitedStoreFile = p.getSecond();
HStore store = getStore(familyName);
try {
store.bulkLoadHFile(familyName, path, commitedStoreFile);
// Note the size of the store file
try {
FileSystem fs = commitedStoreFile.getFileSystem(baseConf);
storeFilesSizes.put(commitedStoreFile.getName(),
fs.getFileStatus(commitedStoreFile).getLen());
} catch (IOException e) {
LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e);
storeFilesSizes.put(commitedStoreFile.getName(), 0L);
}
if (storeFiles.containsKey(familyName)) {
storeFiles.get(familyName).add(commitedStoreFile);
} else {
List<Path> storeFileNames = new ArrayList<>();
storeFileNames.add(commitedStoreFile);
storeFiles.put(familyName, storeFileNames);
}
if (bulkLoadListener != null) {
bulkLoadListener.doneBulkLoad(familyName, path);
}
} catch (IOException ioe) {
// A failure here can cause an atomicity violation that we currently
// cannot recover from since it is likely a failed HDFS operation.
// TODO Need a better story for reverting partial failures due to HDFS.
LOG.error("There was a partial failure due to IO when attempting to" + " load "
+ Bytes.toString(familyName) + " : " + p.getSecond(), ioe);
if (bulkLoadListener != null) {
try {
bulkLoadListener.failedBulkLoad(familyName, path);
} catch (Exception ex) {
LOG.error("Error while calling failedBulkLoad for family "
+ Bytes.toString(familyName) + " with path " + path, ex);
}
}
throw ioe;
}
}
}
isSuccessful = true;
if (conf.getBoolean(COMPACTION_AFTER_BULKLOAD_ENABLE, true)) {
// request compaction
familyWithFinalPath.keySet().forEach(family -> {
HStore store = getStore(family);
try {
if (this.rsServices != null && store.needsCompaction()) {
this.rsServices.getCompactionRequestor().requestSystemCompaction(this, store,
"bulkload hfiles request compaction", true);
LOG.info("Request compaction for region {} family {} after bulk load",
this.getRegionInfo().getEncodedName(), store.getColumnFamilyName());
}
} catch (IOException e) {
LOG.error("bulkload hfiles request compaction error ", e);
}
});
}
} finally {
if (wal != null && !storeFiles.isEmpty()) {
// Write a bulk load event for hfiles that are loaded
try {
WALProtos.BulkLoadDescriptor loadDescriptor =
ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(),
UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()),
storeFiles, storeFilesSizes, seqId, clusterIds, replicate);
WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(),
loadDescriptor, mvcc);
} catch (IOException ioe) {
if (this.rsServices != null) {
// Have to abort the region server because some hfiles have been loaded but we can't
// write the event into the WAL
isSuccessful = false;
this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
}
}
}
closeBulkRegionOperation();
}
return isSuccessful ? storeFiles : null;
}
@Override
public boolean equals(Object o) {
return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
((HRegion) o).getRegionInfo().getRegionName());
}
@Override
public int hashCode() {
return Bytes.hashCode(getRegionInfo().getRegionName());
}
@Override
public String toString() {
return getRegionInfo().getRegionNameAsString();
}
// Utility methods
/**
* A utility method to create new instances of HRegion based on the {@link HConstants#REGION_IMPL}
* configuration property.
* @param tableDir qualified path of directory where region should be located, usually the table
* directory.
* @param wal The WAL is the outbound log for any updates to the HRegion. The wal file is a
* logfile from the previous execution that's custom-computed for this HRegion.
* The HRegionServer computes and sorts the appropriate wal info for this
* HRegion. If there is a previous file (implying that the HRegion has been
* written-to before), then read it from the supplied path.
* @param fs is the filesystem.
* @param conf is global configuration settings.
* @param regionInfo - RegionInfo that describes the region.
* @param htd the table descriptor
* @return the new instance
*/
public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration conf,
RegionInfo regionInfo, final TableDescriptor htd, RegionServerServices rsServices) {
try {
@SuppressWarnings("unchecked")
Class<? extends HRegion> regionClass =
(Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
Constructor<? extends HRegion> c =
regionClass.getConstructor(Path.class, WAL.class, FileSystem.class, Configuration.class,
RegionInfo.class, TableDescriptor.class, RegionServerServices.class);
return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
} catch (Throwable e) {
// todo: what should I throw here?
throw new IllegalStateException("Could not instantiate a region instance.", e);
}
}
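// Hypothetical configuration sketch: a custom HRegion subclass (MyCustomHRegion is a placeholder,
// not part of HBase) can be plugged in through the HConstants.REGION_IMPL property; it must expose
// the seven-argument constructor reflected above.
//
//   Configuration conf = HBaseConfiguration.create();
//   conf.setClass(HConstants.REGION_IMPL, MyCustomHRegion.class, HRegion.class);
//   // HRegion.newHRegion(...) will then instantiate MyCustomHRegion via reflection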
/**
* Convenience method creating new HRegions. Used by createTable.
* @param info Info for region to create.
* @param rootDir Root directory for HBase instance
* @param wal shared WAL
* @param initialize - true to initialize the region
* @return new HRegion
*/
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal,
final boolean initialize) throws IOException {
return createHRegion(info, rootDir, conf, hTableDescriptor, wal, initialize, null);
}
/**
* Convenience method creating new HRegions. Used by createTable.
* @param info Info for region to create.
* @param rootDir Root directory for HBase instance
* @param wal shared WAL
* @param initialize - true to initialize the region
* @param rsRpcServices An interface we can request flushes against.
* @return new HRegion
*/
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal,
final boolean initialize, RegionServerServices rsRpcServices) throws IOException {
LOG.info("creating " + info + ", tableDescriptor="
+ (hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir);
createRegionDir(conf, info, rootDir);
FileSystem fs = rootDir.getFileSystem(conf);
Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable());
HRegion region =
HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, rsRpcServices);
if (initialize) {
region.initialize(null);
}
return region;
}
/**
* Create a region under the given table directory.
*/
public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs,
Path tableDir, TableDescriptor tableDesc) throws IOException {
LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc,
tableDir);
HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo);
HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null);
return region;
}
/**
* Create the region directory in the filesystem.
*/
public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri,
Path rootDir) throws IOException {
FileSystem fs = rootDir.getFileSystem(configuration);
Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable());
// If directory already exists, will log warning and keep going. Will try to create
// .regioninfo. If one exists, will overwrite.
return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri);
}
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal)
throws IOException {
return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
}
/**
* Open a Region.
* @param info Info for region to be opened.
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing
* the result of the call to HRegion#getMinSequenceId() to ensure the wal id is
* properly kept up. HRegionStore does this every time it opens a new region.
* @return new HRegion
*/
public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal,
final Configuration conf) throws IOException {
return openHRegion(info, htd, wal, conf, null, null);
}
/**
* Open a Region.
* @param info Info for region to be opened
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long)
* passing the result of the call to HRegion#getMinSequenceId() to ensure the
* wal id is properly kept up. HRegionStore does this every time it opens a new
* region.
* @param conf The Configuration object to use.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal,
final Configuration conf, final RegionServerServices rsServices,
final CancelableProgressable reporter) throws IOException {
return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
}
/**
* Open a Region.
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing
* the result of the call to HRegion#getMinSequenceId() to ensure the wal id is
* properly kept up. HRegionStore does this every time it opens a new region.
* @param conf The Configuration object to use.
* @return new HRegion
*/
public static HRegion openHRegion(Path rootDir, final RegionInfo info, final TableDescriptor htd,
final WAL wal, final Configuration conf) throws IOException {
return openHRegion(rootDir, info, htd, wal, conf, null, null);
}
/**
* Open a Region.
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long)
* passing the result of the call to HRegion#getMinSequenceId() to ensure the
* wal id is properly kept up. HRegionStore does this every time it opens a new
* region.
* @param conf The Configuration object to use.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegion(final Path rootDir, final RegionInfo info,
final TableDescriptor htd, final WAL wal, final Configuration conf,
final RegionServerServices rsServices, final CancelableProgressable reporter)
throws IOException {
FileSystem fs = null;
if (rsServices != null) {
fs = rsServices.getFileSystem();
}
if (fs == null) {
fs = rootDir.getFileSystem(conf);
}
return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
}
/**
* Open a Region.
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing
* the result of the call to HRegion#getMinSequenceId() to ensure the wal id is
* properly kept up. HRegionStore does this every time it opens a new region.
* @return new HRegion
*/
public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal)
throws IOException {
return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
}
/**
* Open a Region.
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param rootDir Root directory for HBase instance
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long)
* passing the result of the call to HRegion#getMinSequenceId() to ensure the
* wal id is properly kept up. HRegionStore does this every time it opens a new
* region.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal,
final RegionServerServices rsServices, final CancelableProgressable reporter)
throws IOException {
Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable());
return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter);
}
/**
* Open a Region.
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param info Info for region to be opened.
* @param htd the table descriptor
* @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long)
* passing the result of the call to HRegion#getMinSequenceId() to ensure the
* wal id is properly kept up. HRegionStore does this every time it opens a new
* region.
* @param rsServices An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs,
final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal,
final RegionServerServices rsServices, final CancelableProgressable reporter)
throws IOException {
Objects.requireNonNull(info, "RegionInfo cannot be null");
LOG.debug("Opening region: {}", info);
HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
return r.openHRegion(reporter);
}
public NavigableMap<byte[], Integer> getReplicationScope() {
return this.replicationScope;
}
/**
* Useful when reopening a closed region (normally for unit tests)
* @param other original object
* @param reporter An interface we can report progress against.
* @return new HRegion
*/
public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
throws IOException {
HRegionFileSystem regionFs = other.getRegionFileSystem();
HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null);
return r.openHRegion(reporter);
}
public static Region openHRegion(final Region other, final CancelableProgressable reporter)
throws IOException {
return openHRegion((HRegion) other, reporter);
}
/**
* Open HRegion.
*
* Calls initialize and sets sequenceId.
* @return Returns this
*/
private HRegion openHRegion(final CancelableProgressable reporter) throws IOException {
try {
CompoundConfiguration cConfig =
new CompoundConfiguration().add(conf).addBytesMap(htableDescriptor.getValues());
// Refuse to open the region if we are missing local compression support
TableDescriptorChecker.checkCompression(cConfig, htableDescriptor);
// Refuse to open the region if encryption configuration is incorrect or
// codec support is missing
LOG.debug("checking encryption for " + this.getRegionInfo().getEncodedName());
TableDescriptorChecker.checkEncryption(cConfig, htableDescriptor);
// Refuse to open the region if a required class cannot be loaded
LOG.debug("checking classloading for " + this.getRegionInfo().getEncodedName());
TableDescriptorChecker.checkClassLoading(cConfig, htableDescriptor);
this.openSeqNum = initialize(reporter);
this.mvcc.advanceTo(openSeqNum);
// The openSeqNum must be increased every time when a region is assigned, as we rely on it to
// determine whether a region has been successfully reopened. So here we always write open
// marker, even if the table is read only.
if (
wal != null && getRegionServerServices() != null
&& RegionReplicaUtil.isDefaultReplica(getRegionInfo())
) {
writeRegionOpenMarker(wal, openSeqNum);
}
} catch (Throwable t) {
// If the open fails (e.g. via the coprocessor path), MetricsRegionWrapperImpl has
// already been initialized but not closed, so close the region here before
// rethrowing in order to clean it up.
try {
// It is not required to write sequence id file when region open is failed.
// Passing true to skip the sequence id file write.
this.close(true);
} catch (Throwable e) {
LOG.warn("Open region: {} failed. Try close region but got exception ",
this.getRegionInfo(), e);
}
throw t;
}
return this;
}
/**
* Open a Region on a read-only file-system (like hdfs snapshots)
* @param conf The Configuration object to use.
* @param fs Filesystem to use
* @param info Info for region to be opened.
* @param htd the table descriptor
* @return new HRegion
*/
public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs,
final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException {
if (info == null) {
throw new NullPointerException("Passed region info is null");
}
if (LOG.isDebugEnabled()) {
LOG.debug("Opening region (readOnly filesystem): " + info);
}
if (info.getReplicaId() <= 0) {
info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build();
}
HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null);
r.writestate.setReadOnly(true);
return r.openHRegion(null);
}
public static HRegion warmupHRegion(final RegionInfo info, final TableDescriptor htd,
final WAL wal, final Configuration conf, final RegionServerServices rsServices,
final CancelableProgressable reporter) throws IOException {
Objects.requireNonNull(info, "RegionInfo cannot be null");
LOG.debug("Warmup {}", info);
Path rootDir = CommonFSUtils.getRootDir(conf);
Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable());
FileSystem fs = null;
if (rsServices != null) {
fs = rsServices.getFileSystem();
}
if (fs == null) {
fs = rootDir.getFileSystem(conf);
}
HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null);
r.initializeWarmup(reporter);
r.close();
return r;
}
/**
* Computes the Path of the HRegion
* @param tabledir qualified path for table
* @param name ENCODED region name
* @return Path of HRegion directory
* @deprecated For tests only; to be removed.
*/
@Deprecated
public static Path getRegionDir(final Path tabledir, final String name) {
return new Path(tabledir, name);
}
/**
* Determines if the specified row is within the row range specified by the specified RegionInfo
* @param info RegionInfo that specifies the row range
* @param row row to be checked
* @return true if the row is within the range specified by the RegionInfo
*/
public static boolean rowIsInRange(RegionInfo info, final byte[] row) {
return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), row) <= 0))
&& ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), row) > 0));
}
public static boolean rowIsInRange(RegionInfo info, final byte[] row, final int offset,
final short length) {
return ((info.getStartKey().length == 0)
|| (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length, row, offset, length)
<= 0))
&& ((info.getEndKey().length == 0)
|| (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length)
> 0));
}
@Override
public Result get(final Get get) throws IOException {
prepareGet(get);
List<Cell> results = get(get, true);
boolean stale = this.getRegionInfo().getReplicaId() != 0;
return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
}
void prepareGet(final Get get) throws IOException {
checkRow(get.getRow(), "Get");
// Verify families are all valid
if (get.hasFamilies()) {
for (byte[] family : get.familySet()) {
checkFamily(family);
}
} else { // Adding all families to scanner
for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
get.addFamily(family);
}
}
}
@Override
public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
private List<Cell> get(Get get, boolean withCoprocessor, long nonceGroup, long nonce)
throws IOException {
return TraceUtil.trace(() -> getInternal(get, withCoprocessor, nonceGroup, nonce),
() -> createRegionSpan("Region.get"));
}
private List<Cell> getInternal(Get get, boolean withCoprocessor, long nonceGroup, long nonce)
throws IOException {
List<Cell> results = new ArrayList<>();
long before = EnvironmentEdgeManager.currentTime();
// pre-get CP hook
if (withCoprocessor && (coprocessorHost != null)) {
if (coprocessorHost.preGet(get, results)) {
metricsUpdateForGet(results, before);
return results;
}
}
Scan scan = new Scan(get);
if (scan.getLoadColumnFamiliesOnDemandValue() == null) {
scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault());
}
try (RegionScanner scanner = getScanner(scan, null, nonceGroup, nonce)) {
List<Cell> tmp = new ArrayList<>();
scanner.next(tmp);
// Copy EC to heap, then close the scanner.
// This can be an EXPENSIVE call. It may make an extra copy from offheap to onheap buffers.
// See more details in HBASE-26036.
for (Cell cell : tmp) {
results.add(CellUtil.cloneIfNecessary(cell));
}
}
// post-get CP hook
if (withCoprocessor && (coprocessorHost != null)) {
coprocessorHost.postGet(get, results);
}
metricsUpdateForGet(results, before);
return results;
}
void metricsUpdateForGet(List<Cell> results, long before) {
if (this.metricsRegion != null) {
this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before);
}
if (rsServices != null && this.rsServices.getMetrics() != null) {
rsServices.getMetrics().updateReadQueryMeter(getTableDescriptor().getTableName(), 1);
}
}
@Override
public Result mutateRow(RowMutations rm) throws IOException {
return mutateRow(rm, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public Result mutateRow(RowMutations rm, long nonceGroup, long nonce) throws IOException {
final List<Mutation> m = rm.getMutations();
OperationStatus[] statuses = batchMutate(m.toArray(new Mutation[0]), true, nonceGroup, nonce);
List<Result> results = new ArrayList<>();
for (OperationStatus status : statuses) {
if (status.getResult() != null) {
results.add(status.getResult());
}
}
if (results.isEmpty()) {
return null;
}
// Merge the results of the Increment/Append operations
List<Cell> cells = new ArrayList<>();
for (Result result : results) {
if (result.rawCells() != null) {
cells.addAll(Arrays.asList(result.rawCells()));
}
}
return Result.create(cells);
}
/**
* Perform atomic (all or none) mutations within the region.
* @param mutations The list of mutations to perform. mutations can contain
* operations for multiple rows. Caller has to ensure that all rows are
* contained in this region.
* @param rowsToLock Rows to lock
* @param nonceGroup Optional nonce group of the operation (client Id)
* @param nonce Optional nonce of the operation (unique random id to ensure "more
* idempotence"). If multiple rows are locked, care should be taken that
* rowsToLock is sorted in order to avoid deadlocks.
*/
@Override
public void mutateRowsWithLocks(Collection<Mutation> mutations, Collection<byte[]> rowsToLock,
long nonceGroup, long nonce) throws IOException {
batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]),
true, nonceGroup, nonce) {
@Override
public MiniBatchOperationInProgress<Mutation>
lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException {
RowLock prevRowLock = null;
for (byte[] row : rowsToLock) {
try {
RowLock rowLock = region.getRowLock(row, false, prevRowLock); // write lock
if (rowLock != prevRowLock) {
acquiredRowLocks.add(rowLock);
prevRowLock = rowLock;
}
} catch (IOException ioe) {
LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this,
ioe);
throw ioe;
}
}
return createMiniBatch(size(), size());
}
});
}
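// Hypothetical usage sketch: atomically mutate two rows of this region. rowsToLock should be
// sorted so that concurrent callers acquire locks in the same order and avoid deadlocks.
//
//   byte[] rowA = Bytes.toBytes("row-a");
//   byte[] rowB = Bytes.toBytes("row-b");
//   List<Mutation> mutations = Arrays.asList(
//     new Put(rowA).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("1")),
//     new Put(rowB).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("2")));
//   SortedSet<byte[]> rowsToLock = new TreeSet<>(Bytes.BYTES_COMPARATOR);
//   rowsToLock.addAll(Arrays.asList(rowA, rowB));
//   region.mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);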
/** Returns statistics about the current load of the region */
public ClientProtos.RegionLoadStats getLoadStatistics() {
if (!regionStatsEnabled) {
return null;
}
ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
stats.setMemStoreLoad((int) (Math.min(100,
(this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize)));
if (rsServices.getHeapMemoryManager() != null) {
// the HeapMemoryManager uses -0.0 to signal a problem asking the JVM,
// so we could just do the calculation below and we'll get a 0.
// treating it as a special case analogous to no HMM instead, so that it can be
// programmatically treated differently from using <1% of heap.
final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent();
if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) {
stats.setHeapOccupancy((int) (occupancy * 100));
}
}
stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100
? 100
: rsServices.getCompactionPressure() * 100));
return stats.build();
}
@Override
public void processRowsWithLocks(RowProcessor<?, ?> processor) throws IOException {
processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
@Override
public void processRowsWithLocks(RowProcessor<?, ?> processor, long nonceGroup, long nonce)
throws IOException {
processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
}
@Override
public void processRowsWithLocks(RowProcessor<?, ?> processor, long timeout, long nonceGroup,
long nonce) throws IOException {
for (byte[] row : processor.getRowsToLock()) {
checkRow(row, "processRowsWithLocks");
}
if (!processor.readOnly()) {
checkReadOnly();
}
checkResources();
startRegionOperation();
WALEdit walEdit = new WALEdit();
// STEP 1. Run pre-process hook
preProcess(processor, walEdit);
// Short circuit the read only case
if (processor.readOnly()) {
try {
long now = EnvironmentEdgeManager.currentTime();
doProcessRowWithTimeout(processor, now, this, null, null, timeout);
processor.postProcess(this, walEdit, true);
} finally {
closeRegionOperation();
}
return;
}
boolean locked = false;
List<RowLock> acquiredRowLocks = null;
List<Mutation> mutations = new ArrayList<>();
Collection<byte[]> rowsToLock = processor.getRowsToLock();
// This is assigned by mvcc either explicitly below or in the guts of the WAL append
// when it assigns the edit a sequence id (a.k.a. the mvcc write number).
WriteEntry writeEntry = null;
MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing();
// Check for thread interrupt status in case we have been signaled from
// #interruptRegionOperation.
checkInterrupt();
try {
boolean success = false;
try {
// STEP 2. Acquire the row lock(s)
acquiredRowLocks = new ArrayList<>(rowsToLock.size());
RowLock prevRowLock = null;
for (byte[] row : rowsToLock) {
// Attempt to lock all involved rows, throw if any lock times out
// use a writer lock for mixed reads and writes
RowLock rowLock = getRowLockInternal(row, false, prevRowLock);
if (rowLock != prevRowLock) {
acquiredRowLocks.add(rowLock);
prevRowLock = rowLock;
}
}
// Check for thread interrupt status in case we have been signaled from
// #interruptRegionOperation. Do it before we take the lock and disable interrupts for
// the WAL append.
checkInterrupt();
// STEP 3. Region lock
lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 1 : acquiredRowLocks.size());
locked = true;
// From this point until memstore update this operation should not be interrupted.
disableInterrupts();
long now = EnvironmentEdgeManager.currentTime();
// STEP 4. Let the processor scan the rows, generate mutations and add waledits
doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout);
if (!mutations.isEmpty()) {
writeRequestsCount.add(mutations.size());
// STEP 5. Call the preBatchMutate hook
processor.preBatchMutate(this, walEdit);
// STEP 6. Append and sync if walEdit has data to write out.
if (!walEdit.isEmpty()) {
writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()),
processor.getClusterIds(), now, nonceGroup, nonce);
} else {
// We are here if WAL is being skipped.
writeEntry = this.mvcc.begin();
}
// STEP 7. Apply to memstore
long sequenceId = writeEntry.getWriteNumber();
for (Mutation m : mutations) {
// Handle any tag based cell features.
// TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before
// so tags go into WAL?
rewriteCellTags(m.getFamilyCellMap(), m);
for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
Cell cell = cellScanner.current();
if (walEdit.isEmpty()) {
// If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id.
// If no WAL, need to stamp it here.
PrivateCellUtil.setSequenceId(cell, sequenceId);
}
applyToMemStore(getStore(cell), cell, memstoreAccounting);
}
}
// STEP 8. call postBatchMutate hook
processor.postBatchMutate(this);
// STEP 9. Complete mvcc.
mvcc.completeAndWait(writeEntry);
writeEntry = null;
// STEP 10. Release region lock
if (locked) {
this.updatesLock.readLock().unlock();
locked = false;
}
// STEP 11. Release row lock(s)
releaseRowLocks(acquiredRowLocks);
if (rsServices != null && rsServices.getMetrics() != null) {
rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor.getTableName(),
mutations.size());
}
}
success = true;
} finally {
// Call complete rather than completeAndWait because we probably had error if walKey != null
if (writeEntry != null) mvcc.complete(writeEntry);
if (locked) {
this.updatesLock.readLock().unlock();
}
// release locks if some were acquired but another timed out
releaseRowLocks(acquiredRowLocks);
enableInterrupts();
}
// 12. Run post-process hook
processor.postProcess(this, walEdit, success);
} finally {
closeRegionOperation();
if (!mutations.isEmpty()) {
this.incMemStoreSize(memstoreAccounting.getMemStoreSize());
requestFlushIfNeeded();
}
}
}
private void preProcess(final RowProcessor<?, ?> processor, final WALEdit walEdit)
throws IOException {
try {
processor.preProcess(this, walEdit);
} catch (IOException e) {
closeRegionOperation();
throw e;
}
}
private void doProcessRowWithTimeout(final RowProcessor<?, ?> processor, final long now,
final HRegion region, final List<Mutation> mutations, final WALEdit walEdit, final long timeout)
throws IOException {
// Short circuit the no time bound case.
if (timeout < 0) {
try {
processor.process(now, region, mutations, walEdit);
} catch (IOException e) {
String row = processor.getRowsToLock().isEmpty()
? ""
: " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next())
+ "...";
LOG.warn("RowProcessor: {}, in region {}, throws Exception {}",
processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e);
throw e;
}
return;
}
// Case with time bound
FutureTask<Void> task = new FutureTask<>(new Callable<Void>() {
@Override
public Void call() throws IOException {
try {
processor.process(now, region, mutations, walEdit);
return null;
} catch (IOException e) {
String row = processor.getRowsToLock().isEmpty()
? ""
: " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next())
+ "...";
LOG.warn("RowProcessor: {}, in region {}, throws Exception {}",
processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e);
throw e;
}
}
});
rowProcessorExecutor.execute(task);
try {
task.get(timeout, TimeUnit.MILLISECONDS);
} catch (InterruptedException ie) {
throw throwOnInterrupt(ie);
} catch (TimeoutException te) {
String row = processor.getRowsToLock().isEmpty()
? ""
: " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
LOG.error("RowProcessor timeout: {} ms, in region {}, {}", timeout,
getRegionInfo().getRegionNameAsString(), row);
throw new IOException(te);
} catch (Exception e) {
throw new IOException(e);
}
}
@Override
public Result append(Append append) throws IOException {
return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public Result append(Append append, long nonceGroup, long nonce) throws IOException {
return TraceUtil.trace(() -> {
checkReadOnly();
checkResources();
startRegionOperation(Operation.APPEND);
try {
// All edits for the given row (across all column families) must happen atomically.
return mutate(append, true, nonceGroup, nonce).getResult();
} finally {
closeRegionOperation(Operation.APPEND);
}
}, () -> createRegionSpan("Region.append"));
}
@Override
public Result increment(Increment increment) throws IOException {
return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
}
public Result increment(Increment increment, long nonceGroup, long nonce) throws IOException {
return TraceUtil.trace(() -> {
checkReadOnly();
checkResources();
startRegionOperation(Operation.INCREMENT);
try {
// All edits for the given row (across all column families) must happen atomically.
return mutate(increment, true, nonceGroup, nonce).getResult();
} finally {
closeRegionOperation(Operation.INCREMENT);
}
}, () -> createRegionSpan("Region.increment"));
}
private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
long now, long nonceGroup, long nonce) throws IOException {
return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce,
SequenceId.NO_SEQUENCE_ID);
}
/** Returns writeEntry associated with this append */
private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException {
Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(), "WALEdit is null or empty!");
Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID,
"Invalid replay sequence Id for replay WALEdit!");
// Using default cluster id, as this can only happen in the originating cluster.
// A slave cluster receives the final value (not the delta) as a Put. We use HLogKey
// here instead of WALKeyImpl directly to support legacy coprocessors.
WALKeyImpl walKey = walEdit.isReplay()
? new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds,
nonceGroup, nonce, mvcc)
: new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds,
nonceGroup, nonce, mvcc, this.getReplicationScope());
if (walEdit.isReplay()) {
walKey.setOrigLogSeqNum(origLogSeqNum);
}
// don't call the coproc hook for writes to the WAL caused by
// system lifecycle events like flushes or compactions
if (this.coprocessorHost != null && !walEdit.isMetaEdit()) {
this.coprocessorHost.preWALAppend(walKey, walEdit);
}
WriteEntry writeEntry = null;
try {
long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit);
// Call sync on our edit.
if (txid != 0) {
sync(txid, durability);
}
writeEntry = walKey.getWriteEntry();
} catch (IOException ioe) {
if (walKey != null && walKey.getWriteEntry() != null) {
mvcc.complete(walKey.getWriteEntry());
}
throw ioe;
}
return writeEntry;
}
//
// New HBASE-880 Helpers
//
void checkFamily(final byte[] family) throws NoSuchColumnFamilyException {
if (!this.htableDescriptor.hasColumnFamily(family)) {
throw new NoSuchColumnFamilyException("Column family " + Bytes.toString(family)
+ " does not exist in region " + this + " in table " + this.htableDescriptor);
}
}
public static final long FIXED_OVERHEAD = ClassSize.estimateBase(HRegion.class, false);
// woefully out of date - currently missing:
// 1 x HashMap - coprocessorServiceHandlers
// 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
// checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
// writeRequestsCount
// 1 x HRegion$WriteState - writestate
// 1 x RegionCoprocessorHost - coprocessorHost
// 1 x RegionSplitPolicy - splitPolicy
// 1 x MetricsRegion - metricsRegion
// 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
public static final long DEEP_OVERHEAD = FIXED_OVERHEAD + ClassSize.OBJECT + // closeLock
(2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
(3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL,
// compactionsFailed
(3 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints, regionLockHolders
WriteState.HEAP_SIZE + // writestate
ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
(2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
+ 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes
+ 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
+ ClassSize.STORE_SERVICES // store services
+ StoreHotnessProtector.FIXED_SIZE;
@Override
public long heapSize() {
// this does not take into account row locks, recent flushes, mvcc entries, and more
return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum();
}
/**
* Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to be
* available for handling Region#execService(com.google.protobuf.RpcController,
* org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls.
*
* Only a single instance may be registered per region for a given {@link Service} subclass (the
* instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
* After the first registration, subsequent calls with the same service name will fail with a
* return value of {@code false}.
*
* @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
* @return {@code true} if the registration was successful, {@code false} otherwise
*/
public boolean registerService(com.google.protobuf.Service instance) {
/*
* No stacking of instances is allowed for a single service name
*/
com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc);
if (coprocessorServiceHandlers.containsKey(serviceName)) {
LOG.error("Coprocessor service {} already registered, rejecting request from {} in region {}",
serviceName, instance, this);
return false;
}
coprocessorServiceHandlers.put(serviceName, instance);
if (LOG.isDebugEnabled()) {
LOG.debug("Registered coprocessor service: region="
+ Bytes.toStringBinary(getRegionInfo().getRegionName()) + " service=" + serviceName);
}
return true;
}
/**
* Executes a single protocol buffer coprocessor endpoint {@link Service} method using the
* registered protocol handlers. {@link Service} implementations must be registered via the
* {@link #registerService(com.google.protobuf.Service)} method before they are available.
* @param controller an {@code RpcController} implementation to pass to the invoked service
* @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
* and parameters for the method invocation
* @return a protocol buffer {@code Message} instance containing the method's result
* @throws IOException if no registered service handler is found or an error occurs during the
* invocation
* @see #registerService(com.google.protobuf.Service)
*/
public com.google.protobuf.Message execService(com.google.protobuf.RpcController controller,
CoprocessorServiceCall call) throws IOException {
String serviceName = call.getServiceName();
com.google.protobuf.Service service = coprocessorServiceHandlers.get(serviceName);
if (service == null) {
throw new UnknownProtocolException(null, "No registered coprocessor service found for "
+ serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName()));
}
com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
String methodName = call.getMethodName();
com.google.protobuf.Descriptors.MethodDescriptor methodDesc =
CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc);
com.google.protobuf.Message.Builder builder =
service.getRequestPrototype(methodDesc).newBuilderForType();
org.apache.hadoop.hbase.protobuf.ProtobufUtil.mergeFrom(builder,
call.getRequest().toByteArray());
com.google.protobuf.Message request =
CoprocessorRpcUtils.getRequest(service, methodDesc, call.getRequest());
if (coprocessorHost != null) {
request = coprocessorHost.preEndpointInvocation(service, methodName, request);
}
final com.google.protobuf.Message.Builder responseBuilder =
service.getResponsePrototype(methodDesc).newBuilderForType();
service.callMethod(methodDesc, controller, request,
new com.google.protobuf.RpcCallback<com.google.protobuf.Message>() {
@Override
public void run(com.google.protobuf.Message message) {
if (message != null) {
responseBuilder.mergeFrom(message);
}
}
});
if (coprocessorHost != null) {
coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
}
IOException exception =
org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller);
if (exception != null) {
throw exception;
}
return responseBuilder.build();
}
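// Editorial usage sketch, not part of upstream HRegion: builds a CoprocessorServiceCall for a
// previously registered endpoint and dispatches it through execService(). The row, service
// name, method name and request bytes are caller-supplied placeholders; the controller is
// whatever com.google.protobuf.RpcController implementation the caller already has.
private static com.google.protobuf.Message exampleExecService(HRegion region,
  com.google.protobuf.RpcController controller, byte[] row, String serviceName,
  String methodName, com.google.protobuf.ByteString requestBytes) throws IOException {
  CoprocessorServiceCall call = CoprocessorServiceCall.newBuilder()
    .setRow(com.google.protobuf.ByteString.copyFrom(row))
    .setServiceName(serviceName)
    .setMethodName(methodName)
    .setRequest(requestBytes)
    .build();
  return region.execService(controller, call);
}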
public Optional<byte[]> checkSplit() {
return checkSplit(false);
}
/**
* Return the split point. An empty result indicates the region isn't splittable.
*/
public Optional<byte[]> checkSplit(boolean force) {
// Can't split META
if (
this.getRegionInfo().isMetaRegion()
|| TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())
) {
return Optional.empty();
}
// Can't split a region that is closing.
if (this.isClosing()) {
return Optional.empty();
}
if (!force && !splitPolicy.shouldSplit()) {
return Optional.empty();
}
byte[] ret = splitPolicy.getSplitPoint();
if (ret != null && ret.length > 0) {
ret = splitRestriction.getRestrictedSplitPoint(ret);
}
if (ret != null) {
try {
checkRow(ret, "calculated split");
} catch (IOException e) {
LOG.error("Ignoring invalid split for region {}", this, e);
return Optional.empty();
}
return Optional.of(ret);
} else {
return Optional.empty();
}
}
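// Editorial usage sketch, not part of upstream HRegion: checkSplit() yields a split row only
// when the split policy (or the force flag) says the region should split and the calculated
// point passes validation; otherwise the Optional is empty.
private static void exampleCheckSplit(HRegion region) {
  Optional<byte[]> splitPoint = region.checkSplit();
  splitPoint.ifPresent(
    row -> LOG.info("Region {} is splittable at {}", region, Bytes.toStringBinary(row)));
}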
/** Returns the priority that this region should have in the compaction queue */
public int getCompactPriority() {
if (conf.getBoolean(SPLIT_IGNORE_BLOCKING_ENABLED_KEY, false) && checkSplit().isPresent()) {
// if a region should split, split it before compact
return Store.PRIORITY_USER;
}
return stores.values().stream().mapToInt(HStore::getCompactPriority).min()
.orElse(Store.NO_PRIORITY);
}
/** Returns the coprocessor host */
public RegionCoprocessorHost getCoprocessorHost() {
return coprocessorHost;
}
/** @param coprocessorHost the new coprocessor host */
public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
this.coprocessorHost = coprocessorHost;
}
@Override
public void startRegionOperation() throws IOException {
startRegionOperation(Operation.ANY);
}
@Override
public void startRegionOperation(Operation op) throws IOException {
boolean isInterruptableOp = false;
switch (op) {
case GET: // interruptible read operations
case SCAN:
isInterruptableOp = true;
checkReadsEnabled();
break;
case INCREMENT: // interruptible write operations
case APPEND:
case PUT:
case DELETE:
case BATCH_MUTATE:
case CHECK_AND_MUTATE:
isInterruptableOp = true;
break;
default: // all others
break;
}
if (
op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION || op == Operation.COMPACT_REGION
|| op == Operation.COMPACT_SWITCH
) {
// split, merge or compact region doesn't need to check the closing/closed state or lock the
// region
return;
}
if (this.closing.get()) {
throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
}
lock(lock.readLock());
// Update regionLockHolders ONLY for any startRegionOperation call that is invoked from
// an RPC handler
Thread thisThread = Thread.currentThread();
if (isInterruptableOp) {
regionLockHolders.put(thisThread, true);
}
if (this.closed.get()) {
lock.readLock().unlock();
throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
}
// The unit for snapshot is a region. So, all stores for this region must be
// prepared for snapshot operation before proceeding.
if (op == Operation.SNAPSHOT) {
stores.values().forEach(HStore::preSnapshotOperation);
}
try {
if (coprocessorHost != null) {
coprocessorHost.postStartRegionOperation(op);
}
} catch (Exception e) {
if (isInterruptableOp) {
// would be harmless to remove what we didn't add but we know by 'isInterruptableOp'
// if we added this thread to regionLockHolders
regionLockHolders.remove(thisThread);
}
lock.readLock().unlock();
throw new IOException(e);
}
}
@Override
public void closeRegionOperation() throws IOException {
closeRegionOperation(Operation.ANY);
}
@Override
public void closeRegionOperation(Operation operation) throws IOException {
if (operation == Operation.SNAPSHOT) {
stores.values().forEach(HStore::postSnapshotOperation);
}
Thread thisThread = Thread.currentThread();
regionLockHolders.remove(thisThread);
lock.readLock().unlock();
if (coprocessorHost != null) {
coprocessorHost.postCloseRegionOperation(operation);
}
}
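// Editorial usage sketch, not part of upstream HRegion: startRegionOperation() and
// closeRegionOperation() must bracket the actual work in a try/finally so the region close
// lock is always released, mirroring how the RPC handlers use these hooks.
private void exampleGuardedOperation(Operation op) throws IOException {
  startRegionOperation(op);
  try {
    // ... perform the read or write against this region ...
  } finally {
    closeRegionOperation(op);
  }
}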
/**
* This method needs to be called before any public call that reads or modifies stores in bulk. It
* has to be called just before a try. #closeBulkRegionOperation needs to be called in the try's
* finally block. Acquires a write lock (or a read lock if {@code writeLockNeeded} is false) and
* checks if the region is closing or closed.
* @throws NotServingRegionException when the region is closing or closed
* @throws RegionTooBusyException if failed to get the lock in time
* @throws InterruptedIOException if interrupted while waiting for a lock
*/
private void startBulkRegionOperation(boolean writeLockNeeded) throws IOException {
if (this.closing.get()) {
throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
}
if (writeLockNeeded) lock(lock.writeLock());
else lock(lock.readLock());
if (this.closed.get()) {
if (writeLockNeeded) lock.writeLock().unlock();
else lock.readLock().unlock();
throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
}
regionLockHolders.put(Thread.currentThread(), true);
}
/**
* Closes the lock. This needs to be called in the finally block corresponding to the try block of
* #startBulkRegionOperation
*/
private void closeBulkRegionOperation() {
regionLockHolders.remove(Thread.currentThread());
if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
else lock.readLock().unlock();
}
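// Editorial usage sketch, not part of upstream HRegion: the bulk variants follow the same
// try/finally discipline, with the boolean selecting a write lock (exclusive) or a read lock
// (shared) for the duration of the bulk store access.
private void exampleGuardedBulkOperation(boolean writeLockNeeded) throws IOException {
  startBulkRegionOperation(writeLockNeeded);
  try {
    // ... read or rewrite store data in bulk ...
  } finally {
    closeBulkRegionOperation();
  }
}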
/**
* Update LongAdders for the number of puts without WAL and the size of possible data loss. This
* information is exposed by the region server metrics.
*/
private void recordMutationWithoutWal(final Map<byte[], List<Cell>> familyMap) {
numMutationsWithoutWAL.increment();
if (numMutationsWithoutWAL.sum() <= 1) {
LOG.info("writing data to region " + this
+ " with WAL disabled. Data may be lost in the event of a crash.");
}
long mutationSize = 0;
for (List<Cell> cells : familyMap.values()) {
// Optimization: 'foreach' loop is not used. See:
// HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
assert cells instanceof RandomAccess;
int listSize = cells.size();
for (int i = 0; i < listSize; i++) {
Cell cell = cells.get(i);
mutationSize += cell.getSerializedSize();
}
}
dataInMemoryWithoutWAL.add(mutationSize);
}
private void lock(final Lock lock) throws IOException {
lock(lock, 1);
}
/**
* Try to acquire a lock. Throw RegionTooBusyException if failed to get the lock in time. Throw
* InterruptedIOException if interrupted while waiting for the lock.
*/
private void lock(final Lock lock, final int multiplier) throws IOException {
try {
final long waitTime = Math.min(maxBusyWaitDuration,
busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
// Don't print millis. Message is used as a key over in
// RetriesExhaustedWithDetailsException processing.
final String regionName =
this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getRegionNameAsString();
final String serverName = this.getRegionServerServices() == null
? "unknown"
: (this.getRegionServerServices().getServerName() == null
? "unknown"
: this.getRegionServerServices().getServerName().toString());
RegionTooBusyException rtbe = new RegionTooBusyException(
"Failed to obtain lock; regionName=" + regionName + ", server=" + serverName);
LOG.warn("Region is too busy to allow lock acquisition.", rtbe);
throw rtbe;
}
} catch (InterruptedException ie) {
if (LOG.isDebugEnabled()) {
LOG.debug("Interrupted while waiting for a lock in region {}", this);
}
throw throwOnInterrupt(ie);
}
}
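// Editorial note, not part of upstream HRegion: the wait budget computed above is
// min(maxBusyWaitDuration, busyWaitDuration * min(multiplier, maxBusyWaitMultiplier)).
// For example (hypothetical values), with busyWaitDuration = 1000 ms, maxBusyWaitMultiplier = 2
// and maxBusyWaitDuration = 10000 ms, a call with multiplier = 5 waits at most
// min(10000, 1000 * min(5, 2)) = 2000 ms before throwing RegionTooBusyException.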
/**
* Calls sync with the given transaction ID.
* @param txid       the transaction id to sync up to
* @param durability the durability setting that determines whether and how to sync
* @throws IOException If anything goes wrong with DFS
*/
private void sync(long txid, Durability durability) throws IOException {
if (this.getRegionInfo().isMetaRegion()) {
this.wal.sync(txid);
} else {
switch (durability) {
case USE_DEFAULT:
// do what table defaults to
if (shouldSyncWAL()) {
this.wal.sync(txid);
}
break;
case SKIP_WAL:
// nothing to do
break;
case ASYNC_WAL:
// nothing to do
break;
case SYNC_WAL:
this.wal.sync(txid, false);
break;
case FSYNC_WAL:
this.wal.sync(txid, true);
break;
default:
throw new RuntimeException("Unknown durability " + durability);
}
}
}
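// Editorial note, not part of upstream HRegion: for non-meta regions the switch above maps
//   SKIP_WAL, ASYNC_WAL -> no explicit sync is issued here,
//   SYNC_WAL            -> wal.sync(txid, false)  (sync without forcing hsync),
//   FSYNC_WAL           -> wal.sync(txid, true)   (force hsync),
//   USE_DEFAULT         -> sync only when the table-level durability is stronger than
//                          ASYNC_WAL (see shouldSyncWAL() below).
// Meta regions always sync, regardless of the requested durability.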
/**
* Check whether we should sync the wal from the table's durability settings
*/
private boolean shouldSyncWAL() {
return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal();
}
/** Returns the latest sequence number that was read from storage when this region was opened */
public long getOpenSeqNum() {
return this.openSeqNum;
}
@Override
public Map<byte[], Long> getMaxStoreSeqId() {
return this.maxSeqIdInStores;
}
public long getOldestSeqIdOfStore(byte[] familyName) {
return wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName);
}
@Override
public CompactionState getCompactionState() {
boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
return (hasMajor
? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
: (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
}
public void reportCompactionRequestStart(boolean isMajor) {
(isMajor ? majorInProgress : minorInProgress).incrementAndGet();
}
public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) {
int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
// metrics
compactionsFinished.increment();
compactionNumFilesCompacted.add(numFiles);
compactionNumBytesCompacted.add(filesSizeCompacted);
assert newValue >= 0;
}
public void reportCompactionRequestFailure() {
compactionsFailed.increment();
}
public void incrementCompactionsQueuedCount() {
compactionsQueued.increment();
}
public void decrementCompactionsQueuedCount() {
compactionsQueued.decrement();
}
public void incrementFlushesQueuedCount() {
flushesQueued.increment();
}
protected void decrementFlushesQueuedCount() {
flushesQueued.decrement();
}
/**
* If a handler thread is eligible for interrupt, make it ineligible. Should be paired with
* {@link #enableInterrupts()}.
*/
void disableInterrupts() {
regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> false);
}
/**
* If a handler thread was made ineligible for interrupt via {@link #disableInterrupts()}, make
* it eligible again. No-op if interrupts are already enabled.
*/
void enableInterrupts() {
regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> true);
}
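// Editorial usage sketch, not part of upstream HRegion: a handler that must not be interrupted
// across a sensitive window (for example while waiting on an external resource) can temporarily
// opt out of interruptRegionOperations() and restore the default afterwards.
private void exampleUninterruptibleWindow(Runnable sensitiveWork) {
  disableInterrupts();
  try {
    sensitiveWork.run();
  } finally {
    enableInterrupts();
  }
}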
/**
* Interrupt any region operations that have acquired the region lock via
* {@link #startRegionOperation(org.apache.hadoop.hbase.regionserver.Region.Operation)}, or
* {@link #startBulkRegionOperation(boolean)}.
*/
private void interruptRegionOperations() {
for (Map.Entry<Thread, Boolean> entry : regionLockHolders.entrySet()) {
// An entry in this map will have a boolean value indicating if it is currently
// eligible for interrupt; if so, we should interrupt it.
if (entry.getValue().booleanValue()) {
entry.getKey().interrupt();
}
}
}
/**
* Check thread interrupt status and throw an exception if interrupted.
* @throws NotServingRegionException if region is closing
* @throws InterruptedIOException if interrupted but region is not closing
*/
// Package scope for tests
void checkInterrupt() throws NotServingRegionException, InterruptedIOException {
if (Thread.interrupted()) {
if (this.closing.get()) {
throw new NotServingRegionException(
getRegionInfo().getRegionNameAsString() + " is closing");
}
throw new InterruptedIOException();
}
}
/**
* Throw the correct exception upon interrupt
* @param t cause
*/
// Package scope for tests
IOException throwOnInterrupt(Throwable t) {
if (this.closing.get()) {
return (NotServingRegionException) new NotServingRegionException(
getRegionInfo().getRegionNameAsString() + " is closing").initCause(t);
}
return (InterruptedIOException) new InterruptedIOException().initCause(t);
}
/**
* {@inheritDoc}
*/
@Override
public void onConfigurationChange(Configuration conf) {
this.storeHotnessProtector.update(conf);
// update coprocessorHost if the configuration has changed.
if (
CoprocessorConfigurationUtil.checkConfigurationChange(getReadOnlyConfiguration(), conf,
CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
CoprocessorHost.USER_REGION_COPROCESSOR_CONF_KEY)
) {
LOG.info("Update the system coprocessors because the configuration has changed");
decorateRegionConfiguration(conf);
this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
}
}
/**
* {@inheritDoc}
*/
@Override
public void registerChildren(ConfigurationManager manager) {
configurationManager = manager;
stores.values().forEach(manager::registerObserver);
}
/**
* {@inheritDoc}
*/
@Override
public void deregisterChildren(ConfigurationManager manager) {
stores.values().forEach(configurationManager::deregisterObserver);
}
@Override
public CellComparator getCellComparator() {
return cellComparator;
}
public long getMemStoreFlushSize() {
return this.memstoreFlushSize;
}
//// method for debugging tests
void throwException(String title, String regionName) {
StringBuilder buf = new StringBuilder();
buf.append(title + ", ");
buf.append(getRegionInfo().toString());
buf.append(getRegionInfo().isMetaRegion() ? " meta region " : " ");
buf.append("stores: ");
for (HStore s : stores.values()) {
buf.append(s.getColumnFamilyDescriptor().getNameAsString());
buf.append(" size: ");
buf.append(s.getMemStoreSize().getDataSize());
buf.append(" ");
}
buf.append("end-of-stores");
buf.append(", memstore size ");
buf.append(getMemStoreDataSize());
if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) {
throw new RuntimeException(buf.toString());
}
}
@Override
public void requestCompaction(String why, int priority, boolean major,
CompactionLifeCycleTracker tracker) throws IOException {
if (major) {
stores.values().forEach(HStore::triggerMajorCompaction);
}
rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker,
RpcServer.getRequestUser().orElse(null));
}
@Override
public void requestCompaction(byte[] family, String why, int priority, boolean major,
CompactionLifeCycleTracker tracker) throws IOException {
HStore store = stores.get(family);
if (store == null) {
throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family)
+ " does not exist in region " + getRegionInfo().getRegionNameAsString());
}
if (major) {
store.triggerMajorCompaction();
}
rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker,
RpcServer.getRequestUser().orElse(null));
}
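// Editorial usage sketch, not part of upstream HRegion: requests a major compaction of a single
// column family; the family name and reason string are illustrative placeholders.
private void exampleRequestMajorCompaction() throws IOException {
  byte[] family = Bytes.toBytes("cf");
  requestCompaction(family, "example: manual trigger", Store.PRIORITY_USER, true,
    CompactionLifeCycleTracker.DUMMY);
}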
private void requestFlushIfNeeded() throws RegionTooBusyException {
if (isFlushSize(this.memStoreSizing.getMemStoreSize())) {
requestFlush();
}
}
private void requestFlush() {
if (this.rsServices == null) {
return;
}
requestFlush0(FlushLifeCycleTracker.DUMMY);
}
private void requestFlush0(FlushLifeCycleTracker tracker) {
boolean shouldFlush = false;
synchronized (writestate) {
if (!this.writestate.isFlushRequested()) {
shouldFlush = true;
writestate.flushRequested = true;
}
}
if (shouldFlush) {
// Make request outside of synchronize block; HBASE-818.
this.rsServices.getFlushRequester().requestFlush(this, tracker);
if (LOG.isDebugEnabled()) {
LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName());
}
} else {
tracker.notExecuted("Flush already requested on " + this);
}
}
@Override
public void requestFlush(FlushLifeCycleTracker tracker) throws IOException {
requestFlush0(tracker);
}
/**
* This method modifies the region's configuration in order to inject replication-related features.
* @param conf the region configuration to modify
*/
private static void decorateRegionConfiguration(Configuration conf) {
if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) {
String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, "");
String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName();
if (!plugins.contains(replicationCoprocessorClass)) {
conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
(plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass);
}
}
}
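// Editorial note, not part of upstream HRegion: when bulk-load replication is enabled this simply
// appends ReplicationObserver's class name to the region coprocessor list
// (CoprocessorHost.REGION_COPROCESSOR_CONF_KEY) if it is not already present, e.g. a hypothetical
// existing value "com.example.MyObserver" becomes
// "com.example.MyObserver,<ReplicationObserver canonical name>".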
public void addReadRequestsCount(long readRequestsCount) {
this.readRequestsCount.add(readRequestsCount);
}
public void addWriteRequestsCount(long writeRequestsCount) {
this.writeRequestsCount.add(writeRequestsCount);
}
@RestrictedApi(explanation = "Should only be called in tests", link = "",
allowedOnPath = ".*/src/test/.*")
boolean isReadsEnabled() {
return this.writestate.readsEnabled;
}
}