All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hbase.regionserver.HRegion Maven / Gradle / Ivy

There is a newer version: 3.0.0-beta-1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY;
import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;

import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.Objects;
import java.util.Optional;
import java.util.RandomAccess;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.LongAdder;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.CompoundConfiguration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.ExtendedCellBuilderFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MetaCellComparator;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.TagUtil;
import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.client.Append;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.CompactionState;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.IsolationLevel;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.PackagePrivateFieldAccessor;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.conf.ConfigurationManager;
import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
import org.apache.hadoop.hbase.exceptions.TimeoutIOException;
import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
import org.apache.hadoop.hbase.filter.ByteArrayComparable;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterWrapper;
import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
import org.apache.hadoop.hbase.io.HFileLink;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.TimeRange;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
import org.apache.hadoop.hbase.ipc.RpcCall;
import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.mob.MobFileCache;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager;
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker;
import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector;
import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
import org.apache.hadoop.hbase.replication.ReplicationUtils;
import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HashedBytes;
import org.apache.hadoop.hbase.util.NonceKey;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.TableDescriptorChecker;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALSplitUtil;
import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay;
import org.apache.hadoop.util.StringUtils;
import org.apache.htrace.core.TraceScope;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
import org.apache.hbase.thirdparty.com.google.protobuf.Service;
import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;
import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;

/**
 * Regions store data for a certain region of a table.  It stores all columns
 * for each row. A given table consists of one or more Regions.
 *
 * 

An Region is defined by its table and its key extent. * *

Locking at the Region level serves only one purpose: preventing the * region from being closed (and consequently split) while other operations * are ongoing. Each row level operation obtains both a row lock and a region * read lock for the duration of the operation. While a scanner is being * constructed, getScanner holds a read lock. If the scanner is successfully * constructed, it holds a read lock until it is closed. A close takes out a * write lock and consequently will block for ongoing operations and will block * new operations from starting while the close is in progress. */ @SuppressWarnings("deprecation") @InterfaceAudience.Private public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region { private static final Logger LOG = LoggerFactory.getLogger(HRegion.class); public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY = "hbase.hregion.scan.loadColumnFamiliesOnDemand"; public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize"; public static final int DEFAULT_MAX_CELL_SIZE = 10485760; public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE = "hbase.regionserver.minibatch.size"; public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000; public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync"; public static final boolean DEFAULT_WAL_HSYNC = false; /** * This is for for using HRegion as a local storage, where we may put the recovered edits in a * special place. Once this is set, we will only replay the recovered edits under this directory * and ignore the original replay directory configs. */ public static final String SPECIAL_RECOVERED_EDITS_DIR = "hbase.hregion.special.recovered.edits.dir"; /** * Whether to use {@link MetaCellComparator} even if we are not meta region. Used when creating * master local region. */ public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator"; public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false; final AtomicBoolean closed = new AtomicBoolean(false); /* Closing can take some time; use the closing flag if there is stuff we don't * want to do while in closing state; e.g. like offer this region up to the * master as a region to close if the carrying regionserver is overloaded. * Once set, it is never cleared. */ final AtomicBoolean closing = new AtomicBoolean(false); /** * The max sequence id of flushed data on this region. There is no edit in memory that is * less that this sequence id. */ private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM; /** * Record the sequence id of last flush operation. Can be in advance of * {@link #maxFlushedSeqId} when flushing a single column family. In this case, * {@link #maxFlushedSeqId} will be older than the oldest edit in memory. */ private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM; /** * The sequence id of the last replayed open region event from the primary region. This is used * to skip entries before this due to the possibility of replay edits coming out of order from * replication. */ protected volatile long lastReplayedOpenRegionSeqId = -1L; protected volatile long lastReplayedCompactionSeqId = -1L; ////////////////////////////////////////////////////////////////////////////// // Members ////////////////////////////////////////////////////////////////////////////// // map from a locked row to the context for that lock including: // - CountDownLatch for threads waiting on that row // - the thread that owns the lock (allow reentrancy) // - reference count of (reentrant) locks held by the thread // - the row itself private final ConcurrentHashMap lockedRows = new ConcurrentHashMap<>(); protected final Map stores = new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR); // TODO: account for each registered handler in HeapSize computation private Map coprocessorServiceHandlers = Maps.newHashMap(); // Track data size in all memstores private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing(); @VisibleForTesting RegionServicesForStores regionServicesForStores; // Debug possible data loss due to WAL off final LongAdder numMutationsWithoutWAL = new LongAdder(); final LongAdder dataInMemoryWithoutWAL = new LongAdder(); // Debug why CAS operations are taking a while. final LongAdder checkAndMutateChecksPassed = new LongAdder(); final LongAdder checkAndMutateChecksFailed = new LongAdder(); // Number of requests // Count rows for scan final LongAdder readRequestsCount = new LongAdder(); final LongAdder filteredReadRequestsCount = new LongAdder(); // Count rows for multi row mutations final LongAdder writeRequestsCount = new LongAdder(); // Number of requests blocked by memstore size. private final LongAdder blockedRequestsCount = new LongAdder(); // Compaction LongAdders final LongAdder compactionsFinished = new LongAdder(); final LongAdder compactionsFailed = new LongAdder(); final LongAdder compactionNumFilesCompacted = new LongAdder(); final LongAdder compactionNumBytesCompacted = new LongAdder(); final LongAdder compactionsQueued = new LongAdder(); final LongAdder flushesQueued = new LongAdder(); private BlockCache blockCache; private MobFileCache mobFileCache; private final WAL wal; private final HRegionFileSystem fs; protected final Configuration conf; private final Configuration baseConf; private final int rowLockWaitDuration; static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; private Path regionDir; private FileSystem walFS; // set to true if the region is restored from snapshot private boolean isRestoredRegion = false; public void setRestoredRegion(boolean restoredRegion) { isRestoredRegion = restoredRegion; } // The internal wait duration to acquire a lock before read/update // from the region. It is not per row. The purpose of this wait time // is to avoid waiting a long time while the region is busy, so that // we can release the IPC handler soon enough to improve the // availability of the region server. It can be adjusted by // tuning configuration "hbase.busy.wait.duration". final long busyWaitDuration; static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT; // If updating multiple rows in one call, wait longer, // i.e. waiting for busyWaitDuration * # of rows. However, // we can limit the max multiplier. final int maxBusyWaitMultiplier; // Max busy wait duration. There is no point to wait longer than the RPC // purge timeout, when a RPC call will be terminated by the RPC engine. final long maxBusyWaitDuration; // Max cell size. If nonzero, the maximum allowed size for any given cell // in bytes final long maxCellSize; // Number of mutations for minibatch processing. private final int miniBatchSize; // negative number indicates infinite timeout static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L; final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool(); private final ConcurrentHashMap scannerReadPoints; /** * The sequence ID that was enLongAddered when this region was opened. */ private long openSeqNum = HConstants.NO_SEQNUM; /** * The default setting for whether to enable on-demand CF loading for * scan requests to this region. Requests can override it. */ private boolean isLoadingCfsOnDemandDefault = false; private final AtomicInteger majorInProgress = new AtomicInteger(0); private final AtomicInteger minorInProgress = new AtomicInteger(0); // // Context: During replay we want to ensure that we do not lose any data. So, we // have to be conservative in how we replay wals. For each store, we calculate // the maxSeqId up to which the store was flushed. And, skip the edits which // are equal to or lower than maxSeqId for each store. // The following map is populated when opening the region Map maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR); /** Saved state from replaying prepare flush cache */ private PrepareFlushResult prepareFlushResult = null; private volatile ConfigurationManager configurationManager; // Used for testing. private volatile Long timeoutForWriteLock = null; private final CellComparator cellComparator; /** * @return The smallest mvcc readPoint across all the scanners in this * region. Writes older than this readPoint, are included in every * read operation. */ public long getSmallestReadPoint() { long minimumReadPoint; // We need to ensure that while we are calculating the smallestReadPoint // no new RegionScanners can grab a readPoint that we are unaware of. // We achieve this by synchronizing on the scannerReadPoints object. synchronized (scannerReadPoints) { minimumReadPoint = mvcc.getReadPoint(); for (Long readPoint : this.scannerReadPoints.values()) { if (readPoint < minimumReadPoint) { minimumReadPoint = readPoint; } } } return minimumReadPoint; } /* * Data structure of write state flags used coordinating flushes, * compactions and closes. */ static class WriteState { // Set while a memstore flush is happening. volatile boolean flushing = false; // Set when a flush has been requested. volatile boolean flushRequested = false; // Number of compactions running. AtomicInteger compacting = new AtomicInteger(0); // Gets set in close. If set, cannot compact or flush again. volatile boolean writesEnabled = true; // Set if region is read-only volatile boolean readOnly = false; // whether the reads are enabled. This is different than readOnly, because readOnly is // static in the lifetime of the region, while readsEnabled is dynamic volatile boolean readsEnabled = true; /** * Set flags that make this region read-only. * * @param onOff flip value for region r/o setting */ synchronized void setReadOnly(final boolean onOff) { this.writesEnabled = !onOff; this.readOnly = onOff; } boolean isReadOnly() { return this.readOnly; } boolean isFlushRequested() { return this.flushRequested; } void setReadsEnabled(boolean readsEnabled) { this.readsEnabled = readsEnabled; } static final long HEAP_SIZE = ClassSize.align( ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN); } /** * Objects from this class are created when flushing to describe all the different states that * that method ends up in. The Result enum describes those states. The sequence id should only * be specified if the flush was successful, and the failure message should only be specified * if it didn't flush. */ public static class FlushResultImpl implements FlushResult { final Result result; final String failureReason; final long flushSequenceId; final boolean wroteFlushWalMarker; /** * Convenience constructor to use when the flush is successful, the failure message is set to * null. * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED. * @param flushSequenceId Generated sequence id that comes right after the edits in the * memstores. */ FlushResultImpl(Result result, long flushSequenceId) { this(result, flushSequenceId, null, false); assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result .FLUSHED_COMPACTION_NEEDED; } /** * Convenience constructor to use when we cannot flush. * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH. * @param failureReason Reason why we couldn't flush. */ FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) { this(result, -1, failureReason, wroteFlushMarker); assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH; } /** * Constructor with all the parameters. * @param result Any of the Result. * @param flushSequenceId Generated sequence id if the memstores were flushed else -1. * @param failureReason Reason why we couldn't flush, or null. */ FlushResultImpl(Result result, long flushSequenceId, String failureReason, boolean wroteFlushMarker) { this.result = result; this.flushSequenceId = flushSequenceId; this.failureReason = failureReason; this.wroteFlushWalMarker = wroteFlushMarker; } /** * Convenience method, the equivalent of checking if result is * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_NO_COMPACTION_NEEDED. * @return true if the memstores were flushed, else false. */ @Override public boolean isFlushSucceeded() { return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result .FLUSHED_COMPACTION_NEEDED; } /** * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED. * @return True if the flush requested a compaction, else false (doesn't even mean it flushed). */ @Override public boolean isCompactionNeeded() { return result == Result.FLUSHED_COMPACTION_NEEDED; } @Override public String toString() { return new StringBuilder() .append("flush result:").append(result).append(", ") .append("failureReason:").append(failureReason).append(",") .append("flush seq id").append(flushSequenceId).toString(); } @Override public Result getResult() { return result; } } /** A result object from prepare flush cache stage */ @VisibleForTesting static class PrepareFlushResult { final FlushResultImpl result; // indicating a failure result from prepare final TreeMap storeFlushCtxs; final TreeMap> committedFiles; final TreeMap storeFlushableSize; final long startTime; final long flushOpSeqId; final long flushedSeqId; final MemStoreSizing totalFlushableSize; /** Constructs an early exit case */ PrepareFlushResult(FlushResultImpl result, long flushSeqId) { this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD); } /** Constructs a successful prepare flush result */ PrepareFlushResult( TreeMap storeFlushCtxs, TreeMap> committedFiles, TreeMap storeFlushableSize, long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushSeqId, flushedSeqId, totalFlushableSize); } private PrepareFlushResult( FlushResultImpl result, TreeMap storeFlushCtxs, TreeMap> committedFiles, TreeMap storeFlushableSize, long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { this.result = result; this.storeFlushCtxs = storeFlushCtxs; this.committedFiles = committedFiles; this.storeFlushableSize = storeFlushableSize; this.startTime = startTime; this.flushOpSeqId = flushSeqId; this.flushedSeqId = flushedSeqId; this.totalFlushableSize = totalFlushableSize; } public FlushResult getResult() { return this.result; } } /** * A class that tracks exceptions that have been observed in one batch. Not thread safe. */ static class ObservedExceptionsInBatch { private boolean wrongRegion = false; private boolean failedSanityCheck = false; private boolean wrongFamily = false; /** * @return If a {@link WrongRegionException} has been observed. */ boolean hasSeenWrongRegion() { return wrongRegion; } /** * Records that a {@link WrongRegionException} has been observed. */ void sawWrongRegion() { wrongRegion = true; } /** * @return If a {@link FailedSanityCheckException} has been observed. */ boolean hasSeenFailedSanityCheck() { return failedSanityCheck; } /** * Records that a {@link FailedSanityCheckException} has been observed. */ void sawFailedSanityCheck() { failedSanityCheck = true; } /** * @return If a {@link NoSuchColumnFamilyException} has been observed. */ boolean hasSeenNoSuchFamily() { return wrongFamily; } /** * Records that a {@link NoSuchColumnFamilyException} has been observed. */ void sawNoSuchFamily() { wrongFamily = true; } } final WriteState writestate = new WriteState(); long memstoreFlushSize; final long timestampSlop; final long rowProcessorTimeout; // Last flush time for each Store. Useful when we are flushing for each column private final ConcurrentMap lastStoreFlushTimeMap = new ConcurrentHashMap<>(); final RegionServerServices rsServices; private RegionServerAccounting rsAccounting; private long flushCheckInterval; // flushPerChanges is to prevent too many changes in memstore private long flushPerChanges; private long blockingMemStoreSize; // Used to guard closes final ReentrantReadWriteLock lock; // Stop updates lock private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock(); private boolean splitRequest; private byte[] explicitSplitPoint = null; private final MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); // Coprocessor host private RegionCoprocessorHost coprocessorHost; private TableDescriptor htableDescriptor = null; private RegionSplitPolicy splitPolicy; private FlushPolicy flushPolicy; private final MetricsRegion metricsRegion; private final MetricsRegionWrapperImpl metricsRegionWrapper; private final Durability regionDurability; private final boolean regionStatsEnabled; // Stores the replication scope of the various column families of the table // that has non-default scope private final NavigableMap replicationScope = new TreeMap<>( Bytes.BYTES_COMPARATOR); private final StoreHotnessProtector storeHotnessProtector; /** * HRegion constructor. This constructor should only be used for testing and * extensions. Instances of HRegion should be instantiated with the * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method. * * @param tableDir qualified path of directory where region should be located, * usually the table directory. * @param wal The WAL is the outbound log for any updates to the HRegion * The wal file is a logfile from the previous execution that's * custom-computed for this HRegion. The HRegionServer computes and sorts the * appropriate wal info for this HRegion. If there is a previous wal file * (implying that the HRegion has been written-to before), then read it from * the supplied path. * @param fs is the filesystem. * @param confParam is global configuration settings. * @param regionInfo - RegionInfo that describes the region * is new), then read them from the supplied path. * @param htd the table descriptor * @param rsServices reference to {@link RegionServerServices} or null * @deprecated Use other constructors. */ @Deprecated @VisibleForTesting public HRegion(final Path tableDir, final WAL wal, final FileSystem fs, final Configuration confParam, final RegionInfo regionInfo, final TableDescriptor htd, final RegionServerServices rsServices) { this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo), wal, confParam, htd, rsServices); } /** * HRegion constructor. This constructor should only be used for testing and * extensions. Instances of HRegion should be instantiated with the * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method. * * @param fs is the filesystem. * @param wal The WAL is the outbound log for any updates to the HRegion * The wal file is a logfile from the previous execution that's * custom-computed for this HRegion. The HRegionServer computes and sorts the * appropriate wal info for this HRegion. If there is a previous wal file * (implying that the HRegion has been written-to before), then read it from * the supplied path. * @param confParam is global configuration settings. * @param htd the table descriptor * @param rsServices reference to {@link RegionServerServices} or null */ public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam, final TableDescriptor htd, final RegionServerServices rsServices) { if (htd == null) { throw new IllegalArgumentException("Need table descriptor"); } if (confParam instanceof CompoundConfiguration) { throw new IllegalArgumentException("Need original base configuration"); } this.wal = wal; this.fs = fs; // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor this.baseConf = confParam; this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues()); this.cellComparator = htd.isMetaTable() || conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR) ? MetaCellComparator.META_COMPARATOR : CellComparatorImpl.COMPARATOR; this.lock = new ReentrantReadWriteLock(conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK, DEFAULT_FAIR_REENTRANT_CLOSE_LOCK)); this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL); this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES); if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) { throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed " + MAX_FLUSH_PER_CHANGES); } int tmpRowLockDuration = conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); if (tmpRowLockDuration <= 0) { LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row " + "locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration); tmpRowLockDuration = 1; } this.rowLockWaitDuration = tmpRowLockDuration; this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true); this.htableDescriptor = htd; Set families = this.htableDescriptor.getColumnFamilyNames(); for (byte[] family : families) { if (!replicationScope.containsKey(family)) { int scope = htd.getColumnFamily(family).getScope(); // Only store those families that has NON-DEFAULT scope if (scope != REPLICATION_SCOPE_LOCAL) { // Do a copy before storing it here. replicationScope.put(Bytes.copy(family), scope); } } } this.rsServices = rsServices; if (rsServices != null) { this.blockCache = rsServices.getBlockCache().orElse(null); this.mobFileCache = rsServices.getMobFileCache().orElse(null); } this.regionServicesForStores = new RegionServicesForStores(this, rsServices); setHTableSpecificConf(); this.scannerReadPoints = new ConcurrentHashMap<>(); this.busyWaitDuration = conf.getLong( "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION); this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2); if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) { throw new IllegalArgumentException("Invalid hbase.busy.wait.duration (" + busyWaitDuration + ") or hbase.busy.wait.multiplier.max (" + maxBusyWaitMultiplier + "). Their product should be positive"); } this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT); /* * timestamp.slop provides a server-side constraint on the timestamp. This * assumes that you base your TS around currentTimeMillis(). In this case, * throw an error to the user if the user-specified TS is newer than now + * slop. LATEST_TIMESTAMP == don't use this functionality */ this.timestampSlop = conf.getLong( "hbase.hregion.keyvalue.timestamp.slop.millisecs", HConstants.LATEST_TIMESTAMP); /** * Timeout for the process time in processRowsWithLocks(). * Use -1 to switch off time bound. */ this.rowProcessorTimeout = conf.getLong( "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT); this.storeHotnessProtector = new StoreHotnessProtector(this, conf); boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC); /** * This is the global default value for durability. All tables/mutations not defining a * durability or using USE_DEFAULT will default to this value. */ Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL; this.regionDurability = this.htableDescriptor.getDurability() == Durability.USE_DEFAULT ? defaultDurability : this.htableDescriptor.getDurability(); decorateRegionConfiguration(conf); if (rsServices != null) { this.rsAccounting = this.rsServices.getRegionServerAccounting(); // don't initialize coprocessors if not running within a regionserver // TODO: revisit if coprocessors should load in other cases this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this); this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf); } else { this.metricsRegionWrapper = null; this.metricsRegion = null; } if (LOG.isDebugEnabled()) { // Write out region name, its encoded name and storeHotnessProtector as string. LOG.debug("Instantiated " + this +"; "+ storeHotnessProtector.toString()); } configurationManager = null; // disable stats tracking system tables, but check the config for everything else this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals( NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ? false : conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE, HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE); this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE); this.miniBatchSize = conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE); // recover the metrics of read and write requests count if they were retained if (rsServices != null && rsServices.getRegionServerAccounting() != null) { Pair retainedRWRequestsCnt = rsServices.getRegionServerAccounting() .getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName()); if (retainedRWRequestsCnt != null) { this.setReadRequestsCount(retainedRWRequestsCnt.getFirst()); this.setWriteRequestsCount(retainedRWRequestsCnt.getSecond()); // remove them since won't use again rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt() .remove(getRegionInfo().getEncodedName()); } } } void setHTableSpecificConf() { if (this.htableDescriptor == null) return; long flushSize = this.htableDescriptor.getMemStoreFlushSize(); if (flushSize <= 0) { flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE); } this.memstoreFlushSize = flushSize; long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER); this.blockingMemStoreSize = this.memstoreFlushSize * mult; } /** * Initialize this region. * Used only by tests and SplitTransaction to reopen the region. * You should use createHRegion() or openHRegion() * @return What the next sequence (edit) id should be. * @throws IOException e * @deprecated use HRegion.createHRegion() or HRegion.openHRegion() */ @Deprecated public long initialize() throws IOException { return initialize(null); } /** * Initialize this region. * * @param reporter Tickle every so often if initialize is taking a while. * @return What the next sequence (edit) id should be. * @throws IOException e */ @VisibleForTesting long initialize(final CancelableProgressable reporter) throws IOException { //Refuse to open the region if there is no column family in the table if (htableDescriptor.getColumnFamilyCount() == 0) { throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString()+ " should have at least one column family."); } MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this); status.enableStatusJournal(true); long nextSeqId = -1; try { nextSeqId = initializeRegionInternals(reporter, status); return nextSeqId; } catch (IOException e) { LOG.warn("Failed initialize of region= {}, starting to roll back memstore", getRegionInfo().getRegionNameAsString(), e); // global memstore size will be decreased when dropping memstore try { //drop the memory used by memstore if open region fails dropMemStoreContents(); } catch (IOException ioE) { if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) { LOG.warn("Failed drop memstore of region= {}, " + "some chunks may not released forever since MSLAB is enabled", getRegionInfo().getRegionNameAsString()); } } throw e; } finally { // nextSeqid will be -1 if the initialization fails. // At least it will be 0 otherwise. if (nextSeqId == -1) { status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() + " initialization."); } if (LOG.isDebugEnabled()) { LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); } status.cleanup(); } } private long initializeRegionInternals(final CancelableProgressable reporter, final MonitoredTask status) throws IOException { if (coprocessorHost != null) { status.setStatus("Running coprocessor pre-open hook"); coprocessorHost.preOpen(); } // Write HRI to a file in case we need to recover hbase:meta // Only the primary replica should write .regioninfo if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { status.setStatus("Writing region info on filesystem"); fs.checkRegionInfoOnFilesystem(); } // Initialize all the HStores status.setStatus("Initializing all the Stores"); long maxSeqId = initializeStores(reporter, status); this.mvcc.advanceTo(maxSeqId); if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) { Collection stores = this.stores.values(); try { // update the stores that we are replaying stores.forEach(HStore::startReplayingFromWAL); // Recover any edits if available. maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status)); // Recover any hfiles if available maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores)); // Make sure mvcc is up to max. this.mvcc.advanceTo(maxSeqId); } finally { // update the stores that we are done replaying stores.forEach(HStore::stopReplayingFromWAL); } } this.lastReplayedOpenRegionSeqId = maxSeqId; this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this)); this.writestate.flushRequested = false; this.writestate.compacting.set(0); if (this.writestate.writesEnabled) { // Remove temporary data left over from old regions status.setStatus("Cleaning up temporary data from old regions"); fs.cleanupTempDir(); } if (this.writestate.writesEnabled) { status.setStatus("Cleaning up detritus from prior splits"); // Get rid of any splits or merges that were lost in-progress. Clean out // these directories here on open. We may be opening a region that was // being split but we crashed in the middle of it all. fs.cleanupAnySplitDetritus(); fs.cleanupMergesDir(); } // Initialize split policy this.splitPolicy = RegionSplitPolicy.create(this, conf); // Initialize flush policy this.flushPolicy = FlushPolicyFactory.create(this, conf); long lastFlushTime = EnvironmentEdgeManager.currentTime(); for (HStore store: stores.values()) { this.lastStoreFlushTimeMap.put(store, lastFlushTime); } // Use maximum of log sequenceid or that which was found in stores // (particularly if no recovered edits, seqid will be -1). long nextSeqId = maxSeqId + 1; if (!isRestoredRegion) { // always get openSeqNum from the default replica, even if we are secondary replicas long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf, RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem, this::getWalFileSystem); nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1; // The openSeqNum will always be increase even for read only region, as we rely on it to // determine whether a region has been successfully reopened, so here we always need to update // the max sequence id file. if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName()); WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), nextSeqId - 1); // This means we have replayed all the recovered edits and also written out the max sequence // id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617 // for more details. Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), getRegionInfo().getEncodedName()); FileSystem walFs = getWalFileSystem(); if (walFs.exists(wrongRegionWALDir)) { if (!walFs.delete(wrongRegionWALDir, true)) { LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir); } } } } LOG.info("Opened {}; next sequenceid={}", this.getRegionInfo().getShortNameToLog(), nextSeqId); // A region can be reopened if failed a split; reset flags this.closing.set(false); this.closed.set(false); if (coprocessorHost != null) { status.setStatus("Running coprocessor post-open hooks"); coprocessorHost.postOpen(); } status.markComplete("Region opened successfully"); return nextSeqId; } /** * Open all Stores. * @param reporter * @param status * @return Highest sequenceId found out in a Store. * @throws IOException */ private long initializeStores(CancelableProgressable reporter, MonitoredTask status) throws IOException { return initializeStores(reporter, status, false); } private long initializeStores(CancelableProgressable reporter, MonitoredTask status, boolean warmup) throws IOException { // Load in all the HStores. long maxSeqId = -1; // initialized to -1 so that we pick up MemstoreTS from column families long maxMemstoreTS = -1; if (htableDescriptor.getColumnFamilyCount() != 0) { // initialize the thread pool for opening stores in parallel. ThreadPoolExecutor storeOpenerThreadPool = getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog()); CompletionService completionService = new ExecutorCompletionService<>(storeOpenerThreadPool); // initialize each store in parallel for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) { status.setStatus("Instantiating store for column family " + family); completionService.submit(new Callable() { @Override public HStore call() throws IOException { return instantiateHStore(family, warmup); } }); } boolean allStoresOpened = false; boolean hasSloppyStores = false; try { for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) { Future future = completionService.take(); HStore store = future.get(); this.stores.put(store.getColumnFamilyDescriptor().getName(), store); if (store.isSloppyMemStore()) { hasSloppyStores = true; } long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L); maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()), storeMaxSequenceId); if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) { maxSeqId = storeMaxSequenceId; } long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L); if (maxStoreMemstoreTS > maxMemstoreTS) { maxMemstoreTS = maxStoreMemstoreTS; } } allStoresOpened = true; if(hasSloppyStores) { htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor) .setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName()) .build(); LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this); } } catch (InterruptedException e) { throw (InterruptedIOException)new InterruptedIOException().initCause(e); } catch (ExecutionException e) { throw new IOException(e.getCause()); } finally { storeOpenerThreadPool.shutdownNow(); if (!allStoresOpened) { // something went wrong, close all opened stores LOG.error("Could not initialize all stores for the region=" + this); for (HStore store : this.stores.values()) { try { store.close(); } catch (IOException e) { LOG.warn("close store {} failed in region {}", store.toString(), this, e); } } } } } return Math.max(maxSeqId, maxMemstoreTS + 1); } private void initializeWarmup(final CancelableProgressable reporter) throws IOException { MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this); // Initialize all the HStores status.setStatus("Warming up all the Stores"); try { initializeStores(reporter, status, true); } finally { status.markComplete("Done warming up."); } } /** * @return Map of StoreFiles by column family */ private NavigableMap> getStoreFiles() { NavigableMap> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); for (HStore store : stores.values()) { Collection storeFiles = store.getStorefiles(); if (storeFiles == null) { continue; } List storeFileNames = new ArrayList<>(); for (HStoreFile storeFile : storeFiles) { storeFileNames.add(storeFile.getPath()); } allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames); } return allStoreFiles; } @VisibleForTesting protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException { Map> storeFiles = getStoreFiles(); RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor( RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId, getRegionServerServices().getServerName(), storeFiles); WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc, mvcc); } private void writeRegionCloseMarker(WAL wal) throws IOException { Map> storeFiles = getStoreFiles(); RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor( RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(), getRegionServerServices().getServerName(), storeFiles); WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc, mvcc); // Store SeqId in WAL FileSystem when a region closes // checking region folder exists is due to many tests which delete the table folder while a // table is still online if (getWalFileSystem().exists(getWALRegionDir())) { WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), mvcc.getReadPoint()); } } /** * @return True if this region has references. */ public boolean hasReferences() { return stores.values().stream().anyMatch(HStore::hasReferences); } public void blockUpdates() { this.updatesLock.writeLock().lock(); } public void unblockUpdates() { this.updatesLock.writeLock().unlock(); } public HDFSBlocksDistribution getHDFSBlocksDistribution() { HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); stores.values().stream().filter(s -> s.getStorefiles() != null) .flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution) .forEachOrdered(hdfsBlocksDistribution::add); return hdfsBlocksDistribution; } /** * This is a helper function to compute HDFS block distribution on demand * @param conf configuration * @param tableDescriptor TableDescriptor of the table * @param regionInfo encoded name of the region * @return The HDFS blocks distribution for the given region. */ public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException { Path tablePath = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName()); return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath); } /** * This is a helper function to compute HDFS block distribution on demand * @param conf configuration * @param tableDescriptor TableDescriptor of the table * @param regionInfo encoded name of the region * @param tablePath the table directory * @return The HDFS blocks distribution for the given region. * @throws IOException */ public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException { HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); FileSystem fs = tablePath.getFileSystem(conf); HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo); for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) { List locatedFileStatusList = HRegionFileSystem .getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true); if (locatedFileStatusList == null) { continue; } for (LocatedFileStatus status : locatedFileStatusList) { Path p = status.getPath(); if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) { // Only construct StoreFileInfo object if its not a hfile, save obj // creation StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status); hdfsBlocksDistribution.add(storeFileInfo .computeHDFSBlocksDistribution(fs)); } else if (StoreFileInfo.isHFile(p)) { // If its a HFile, then lets just add to the block distribution // lets not create more objects here, not even another HDFSBlocksDistribution FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations()); } else { throw new IOException("path=" + p + " doesn't look like a valid StoreFile"); } } } return hdfsBlocksDistribution; } /** * Increase the size of mem store in this region and the size of global mem * store */ void incMemStoreSize(MemStoreSize mss) { incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), mss.getCellsCount()); } void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, int cellsCountDelta) { if (this.rsAccounting != null) { rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); } long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta, cellsCountDelta); checkNegativeMemStoreDataSize(dataSize, dataSizeDelta); } void decrMemStoreSize(MemStoreSize mss) { decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), mss.getCellsCount()); } void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, int cellsCountDelta) { if (this.rsAccounting != null) { rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); } long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta, cellsCountDelta); checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta); } private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) { // This is extremely bad if we make memStoreSizing negative. Log as much info on the offending // caller as possible. (memStoreSizing might be a negative value already -- freeing memory) if (memStoreDataSize < 0) { LOG.error("Asked to modify this region's (" + this.toString() + ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing=" + (memStoreDataSize - delta) + ", delta=" + delta, new Exception()); } } @Override public RegionInfo getRegionInfo() { return this.fs.getRegionInfo(); } /** * @return Instance of {@link RegionServerServices} used by this HRegion. * Can be null. */ RegionServerServices getRegionServerServices() { return this.rsServices; } @Override public long getReadRequestsCount() { return readRequestsCount.sum(); } @Override public long getFilteredReadRequestsCount() { return filteredReadRequestsCount.sum(); } @Override public long getWriteRequestsCount() { return writeRequestsCount.sum(); } @Override public long getMemStoreDataSize() { return memStoreSizing.getDataSize(); } @Override public long getMemStoreHeapSize() { return memStoreSizing.getHeapSize(); } @Override public long getMemStoreOffHeapSize() { return memStoreSizing.getOffHeapSize(); } /** @return store services for this region, to access services required by store level needs */ public RegionServicesForStores getRegionServicesForStores() { return regionServicesForStores; } @Override public long getNumMutationsWithoutWAL() { return numMutationsWithoutWAL.sum(); } @Override public long getDataInMemoryWithoutWAL() { return dataInMemoryWithoutWAL.sum(); } @Override public long getBlockedRequestsCount() { return blockedRequestsCount.sum(); } @Override public long getCheckAndMutateChecksPassed() { return checkAndMutateChecksPassed.sum(); } @Override public long getCheckAndMutateChecksFailed() { return checkAndMutateChecksFailed.sum(); } // TODO Needs to check whether we should expose our metrics system to CPs. If CPs themselves doing // the op and bypassing the core, this might be needed? Should be stop supporting the bypass // feature? public MetricsRegion getMetrics() { return metricsRegion; } @Override public boolean isClosed() { return this.closed.get(); } @Override public boolean isClosing() { return this.closing.get(); } @Override public boolean isReadOnly() { return this.writestate.isReadOnly(); } @Override public boolean isAvailable() { return !isClosed() && !isClosing(); } @Override public boolean isSplittable() { return isAvailable() && !hasReferences(); } @Override public boolean isMergeable() { if (!isAvailable()) { LOG.debug("Region " + this + " is not mergeable because it is closing or closed"); return false; } if (hasReferences()) { LOG.debug("Region " + this + " is not mergeable because it has references"); return false; } return true; } public boolean areWritesEnabled() { synchronized(this.writestate) { return this.writestate.writesEnabled; } } @VisibleForTesting public MultiVersionConcurrencyControl getMVCC() { return mvcc; } @Override public long getMaxFlushedSeqId() { return maxFlushedSeqId; } /** * @return readpoint considering given IsolationLevel. Pass {@code null} for default */ public long getReadPoint(IsolationLevel isolationLevel) { if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) { // This scan can read even uncommitted transactions return Long.MAX_VALUE; } return mvcc.getReadPoint(); } public boolean isLoadingCfsOnDemandDefault() { return this.isLoadingCfsOnDemandDefault; } /** * Close down this HRegion. Flush the cache, shut down each HStore, don't * service any more calls. * *

This method could take some time to execute, so don't call it from a * time-sensitive thread. * * @return Vector of all the storage files that the HRegion's component * HStores make use of. It's a list of all StoreFile objects. Returns empty * vector if already closed and null if judged that it should not close. * * @throws IOException e * @throws DroppedSnapshotException Thrown when replay of wal is required * because a Snapshot was not properly persisted. The region is put in closing mode, and the * caller MUST abort after this. */ public Map> close() throws IOException { return close(false); } private final Object closeLock = new Object(); /** Conf key for fair locking policy */ public static final String FAIR_REENTRANT_CLOSE_LOCK = "hbase.regionserver.fair.region.close.lock"; public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true; /** Conf key for the periodic flush interval */ public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL = "hbase.regionserver.optionalcacheflushinterval"; /** Default interval for the memstore flush */ public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000; /** Default interval for System tables memstore flush */ public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes /** Conf key to force a flush if there are already enough changes for one region in memstore */ public static final String MEMSTORE_FLUSH_PER_CHANGES = "hbase.regionserver.flush.per.changes"; public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 millions /** * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region */ public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G /** * Close down this HRegion. Flush the cache unless abort parameter is true, * Shut down each HStore, don't service any more calls. * * This method could take some time to execute, so don't call it from a * time-sensitive thread. * * @param abort true if server is aborting (only during testing) * @return Vector of all the storage files that the HRegion's component * HStores make use of. It's a list of StoreFile objects. Can be null if * we are not to close at this time or we are already closed. * * @throws IOException e * @throws DroppedSnapshotException Thrown when replay of wal is required * because a Snapshot was not properly persisted. The region is put in closing mode, and the * caller MUST abort after this. */ public Map> close(boolean abort) throws IOException { // Only allow one thread to close at a time. Serialize them so dual // threads attempting to close will run up against each other. MonitoredTask status = TaskMonitor.get().createStatus( "Closing region " + this.getRegionInfo().getEncodedName() + (abort ? " due to abort" : "")); status.enableStatusJournal(true); status.setStatus("Waiting for close lock"); try { synchronized (closeLock) { return doClose(abort, status); } } finally { if (LOG.isDebugEnabled()) { LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); } status.cleanup(); } } /** * Exposed for some very specific unit tests. */ @VisibleForTesting public void setClosing(boolean closing) { this.closing.set(closing); } /** * The {@link HRegion#doClose} will block forever if someone tries proving the dead lock via the unit test. * Instead of blocking, the {@link HRegion#doClose} will throw exception if you set the timeout. * @param timeoutForWriteLock the second time to wait for the write lock in {@link HRegion#doClose} */ @VisibleForTesting public void setTimeoutForWriteLock(long timeoutForWriteLock) { assert timeoutForWriteLock >= 0; this.timeoutForWriteLock = timeoutForWriteLock; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK_EXCEPTION_PATH", justification="I think FindBugs is confused") private Map> doClose(boolean abort, MonitoredTask status) throws IOException { if (isClosed()) { LOG.warn("Region " + this + " already closed"); return null; } if (coprocessorHost != null) { status.setStatus("Running coprocessor pre-close hooks"); this.coprocessorHost.preClose(abort); } status.setStatus("Disabling compacts and flushes for region"); boolean canFlush = true; synchronized (writestate) { // Disable compacting and flushing by background threads for this // region. canFlush = !writestate.readOnly; writestate.writesEnabled = false; LOG.debug("Closing {}, disabling compactions & flushes", this.getRegionInfo().getEncodedName()); waitForFlushesAndCompactions(); } // If we were not just flushing, is it worth doing a preflush...one // that will clear out of the bulk of the memstore before we put up // the close flag? if (!abort && worthPreFlushing() && canFlush) { status.setStatus("Pre-flushing region before close"); LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName()); try { internalFlushcache(status); } catch (IOException ioe) { // Failed to flush the region. Keep going. status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage()); } } if (timeoutForWriteLock == null || timeoutForWriteLock == Long.MAX_VALUE) { // block waiting for the lock for closing lock.writeLock().lock(); // FindBugs: Complains UL_UNRELEASED_LOCK_EXCEPTION_PATH but seems fine } else { try { boolean succeed = lock.writeLock().tryLock(timeoutForWriteLock, TimeUnit.SECONDS); if (!succeed) { throw new IOException("Failed to get write lock when closing region"); } } catch (InterruptedException e) { throw (InterruptedIOException) new InterruptedIOException().initCause(e); } } this.closing.set(true); LOG.info("Closing region {}", this); status.setStatus("Disabling writes for close"); try { if (this.isClosed()) { status.abort("Already got closed by another process"); // SplitTransaction handles the null return null; } LOG.debug("Updates disabled for region " + this); // Don't flush the cache if we are aborting if (!abort && canFlush) { int failedfFlushCount = 0; int flushCount = 0; long tmp = 0; long remainingSize = this.memStoreSizing.getDataSize(); while (remainingSize > 0) { try { internalFlushcache(status); if(flushCount >0) { LOG.info("Running extra flush, " + flushCount + " (carrying snapshot?) " + this); } flushCount++; tmp = this.memStoreSizing.getDataSize(); if (tmp >= remainingSize) { failedfFlushCount++; } remainingSize = tmp; if (failedfFlushCount > 5) { // If we failed 5 times and are unable to clear memory, abort // so we do not lose data throw new DroppedSnapshotException("Failed clearing memory after " + flushCount + " attempts on region: " + Bytes.toStringBinary(getRegionInfo().getRegionName())); } } catch (IOException ioe) { status.setStatus("Failed flush " + this + ", putting online again"); synchronized (writestate) { writestate.writesEnabled = true; } // Have to throw to upper layers. I can't abort server from here. throw ioe; } } } Map> result = new TreeMap<>(Bytes.BYTES_COMPARATOR); if (!stores.isEmpty()) { // initialize the thread pool for closing stores in parallel. ThreadPoolExecutor storeCloserThreadPool = getStoreOpenAndCloseThreadPool("StoreCloser-" + getRegionInfo().getRegionNameAsString()); CompletionService>> completionService = new ExecutorCompletionService<>(storeCloserThreadPool); // close each store in parallel for (HStore store : stores.values()) { MemStoreSize mss = store.getFlushableSize(); if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) { if (getRegionServerServices() != null) { getRegionServerServices().abort("Assertion failed while closing store " + getRegionInfo().getRegionNameAsString() + " " + store + ". flushableSize expected=0, actual={" + mss + "}. Current memStoreSize=" + this.memStoreSizing.getMemStoreSize() + ". Maybe a coprocessor " + "operation failed and left the memstore in a partially updated state.", null); } } completionService .submit(new Callable>>() { @Override public Pair> call() throws IOException { return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close()); } }); } try { for (int i = 0; i < stores.size(); i++) { Future>> future = completionService.take(); Pair> storeFiles = future.get(); List familyFiles = result.get(storeFiles.getFirst()); if (familyFiles == null) { familyFiles = new ArrayList<>(); result.put(storeFiles.getFirst(), familyFiles); } familyFiles.addAll(storeFiles.getSecond()); } } catch (InterruptedException e) { throw (InterruptedIOException)new InterruptedIOException().initCause(e); } catch (ExecutionException e) { Throwable cause = e.getCause(); if (cause instanceof IOException) { throw (IOException) cause; } throw new IOException(cause); } finally { storeCloserThreadPool.shutdownNow(); } } status.setStatus("Writing region close event to WAL"); // Always write close marker to wal even for read only table. This is not a big problem as we // do not write any data into the region; it is just a meta edit in the WAL file. if (!abort && wal != null && getRegionServerServices() != null && RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { writeRegionCloseMarker(wal); } this.closed.set(true); if (!canFlush) { decrMemStoreSize(this.memStoreSizing.getMemStoreSize()); } else if (this.memStoreSizing.getDataSize() != 0) { LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this); } if (coprocessorHost != null) { status.setStatus("Running coprocessor post-close hooks"); this.coprocessorHost.postClose(abort); } if (this.metricsRegion != null) { this.metricsRegion.close(); } if (this.metricsRegionWrapper != null) { Closeables.close(this.metricsRegionWrapper, true); } status.markComplete("Closed"); LOG.info("Closed {}", this); return result; } finally { lock.writeLock().unlock(); } } /** Wait for all current flushes and compactions of the region to complete */ // TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for // Phoenix needs. public void waitForFlushesAndCompactions() { synchronized (writestate) { if (this.writestate.readOnly) { // we should not wait for replayed flushed if we are read only (for example in case the // region is a secondary replica). return; } boolean interrupted = false; try { while (writestate.compacting.get() > 0 || writestate.flushing) { LOG.debug("waiting for " + writestate.compacting + " compactions" + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this); try { writestate.wait(); } catch (InterruptedException iex) { // essentially ignore and propagate the interrupt back up LOG.warn("Interrupted while waiting in region {}", this); interrupted = true; break; } } } finally { if (interrupted) { Thread.currentThread().interrupt(); } } } } /** * Wait for all current flushes of the region to complete */ public void waitForFlushes() { waitForFlushes(0);// Unbound wait } @Override public boolean waitForFlushes(long timeout) { synchronized (writestate) { if (this.writestate.readOnly) { // we should not wait for replayed flushed if we are read only (for example in case the // region is a secondary replica). return true; } if (!writestate.flushing) return true; long start = System.currentTimeMillis(); long duration = 0; boolean interrupted = false; LOG.debug("waiting for cache flush to complete for region " + this); try { while (writestate.flushing) { if (timeout > 0 && duration >= timeout) break; try { long toWait = timeout == 0 ? 0 : (timeout - duration); writestate.wait(toWait); } catch (InterruptedException iex) { // essentially ignore and propagate the interrupt back up LOG.warn("Interrupted while waiting in region {}", this); interrupted = true; break; } finally { duration = System.currentTimeMillis() - start; } } } finally { if (interrupted) { Thread.currentThread().interrupt(); } } LOG.debug("Waited {} ms for region {} flush to complete", duration, this); return !(writestate.flushing); } } @Override public Configuration getReadOnlyConfiguration() { return new ReadOnlyConfiguration(this.conf); } protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool( final String threadNamePrefix) { int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); int maxThreads = Math.min(numStores, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)); return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); } protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool( final String threadNamePrefix) { int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); int maxThreads = Math.max(1, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX) / numStores); return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); } static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads, final String threadNamePrefix) { return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS, new ThreadFactory() { private int count = 1; @Override public Thread newThread(Runnable r) { return new Thread(r, threadNamePrefix + "-" + count++); } }); } /** * @return True if its worth doing a flush before we put up the close flag. */ private boolean worthPreFlushing() { return this.memStoreSizing.getDataSize() > this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5); } ////////////////////////////////////////////////////////////////////////////// // HRegion accessors ////////////////////////////////////////////////////////////////////////////// @Override public TableDescriptor getTableDescriptor() { return this.htableDescriptor; } @VisibleForTesting void setTableDescriptor(TableDescriptor desc) { htableDescriptor = desc; } /** @return WAL in use for this region */ public WAL getWAL() { return this.wal; } public BlockCache getBlockCache() { return this.blockCache; } /** * Only used for unit test which doesn't start region server. */ @VisibleForTesting public void setBlockCache(BlockCache blockCache) { this.blockCache = blockCache; } public MobFileCache getMobFileCache() { return this.mobFileCache; } /** * Only used for unit test which doesn't start region server. */ @VisibleForTesting public void setMobFileCache(MobFileCache mobFileCache) { this.mobFileCache = mobFileCache; } /** * @return split policy for this region. */ public RegionSplitPolicy getSplitPolicy() { return this.splitPolicy; } /** * A split takes the config from the parent region & passes it to the daughter * region's constructor. If 'conf' was passed, you would end up using the HTD * of the parent region in addition to the new daughter HTD. Pass 'baseConf' * to the daughter regions to avoid this tricky dedupe problem. * @return Configuration object */ Configuration getBaseConf() { return this.baseConf; } /** @return {@link FileSystem} being used by this region */ public FileSystem getFilesystem() { return fs.getFileSystem(); } /** @return the {@link HRegionFileSystem} used by this region */ public HRegionFileSystem getRegionFileSystem() { return this.fs; } /** @return the WAL {@link HRegionFileSystem} used by this region */ HRegionWALFileSystem getRegionWALFileSystem() throws IOException { return new HRegionWALFileSystem(conf, getWalFileSystem(), CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo()); } /** @return the WAL {@link FileSystem} being used by this region */ FileSystem getWalFileSystem() throws IOException { if (walFS == null) { walFS = CommonFSUtils.getWALFileSystem(conf); } return walFS; } /** * @return the Region directory under WALRootDirectory * @throws IOException if there is an error getting WALRootDir */ @VisibleForTesting public Path getWALRegionDir() throws IOException { if (regionDir == null) { regionDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(), getRegionInfo().getEncodedName()); } return regionDir; } @Override public long getEarliestFlushTimeForAllStores() { return Collections.min(lastStoreFlushTimeMap.values()); } @Override public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException { long result = Long.MAX_VALUE; for (HStore store : stores.values()) { Collection storeFiles = store.getStorefiles(); if (storeFiles == null) { continue; } for (HStoreFile file : storeFiles) { StoreFileReader sfReader = file.getReader(); if (sfReader == null) { continue; } HFile.Reader reader = sfReader.getHFileReader(); if (reader == null) { continue; } if (majorCompactionOnly) { byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY); if (val == null || !Bytes.toBoolean(val)) { continue; } } result = Math.min(result, reader.getFileContext().getFileCreateTime()); } } return result == Long.MAX_VALUE ? 0 : result; } RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) { long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId; byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes(); regionLoadBldr.clearStoreCompleteSequenceId(); for (byte[] familyName : this.stores.keySet()) { long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName); // Subtract - 1 to go earlier than the current oldest, unflushed edit in memstore; this will // give us a sequence id that is for sure flushed. We want edit replay to start after this // sequence id in this region. If NO_SEQNUM, use the regions maximum flush id. long csid = (earliest == HConstants.NO_SEQNUM)? lastFlushOpSeqIdLocal: earliest - 1; regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder() .setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build()); } return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId()); } ////////////////////////////////////////////////////////////////////////////// // HRegion maintenance. // // These methods are meant to be called periodically by the HRegionServer for // upkeep. ////////////////////////////////////////////////////////////////////////////// /** * Do preparation for pending compaction. * @throws IOException */ protected void doRegionCompactionPrep() throws IOException { } /** * Synchronously compact all stores in the region. *

This operation could block for a long time, so don't call it from a * time-sensitive thread. *

Note that no locks are taken to prevent possible conflicts between * compaction and splitting activities. The regionserver does not normally compact * and split in parallel. However by calling this method you may introduce * unexpected and unhandled concurrency. Don't do this unless you know what * you are doing. * * @param majorCompaction True to force a major compaction regardless of thresholds * @throws IOException */ public void compact(boolean majorCompaction) throws IOException { if (majorCompaction) { stores.values().forEach(HStore::triggerMajorCompaction); } for (HStore s : stores.values()) { Optional compaction = s.requestCompaction(); if (compaction.isPresent()) { ThroughputController controller = null; if (rsServices != null) { controller = CompactionThroughputControllerFactory.create(rsServices, conf); } if (controller == null) { controller = NoLimitThroughputController.INSTANCE; } compact(compaction.get(), s, controller, null); } } } /** * This is a helper function that compact all the stores synchronously. *

* It is used by utilities and testing */ @VisibleForTesting public void compactStores() throws IOException { for (HStore s : stores.values()) { Optional compaction = s.requestCompaction(); if (compaction.isPresent()) { compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null); } } } /** * This is a helper function that compact the given store. *

* It is used by utilities and testing */ @VisibleForTesting void compactStore(byte[] family, ThroughputController throughputController) throws IOException { HStore s = getStore(family); Optional compaction = s.requestCompaction(); if (compaction.isPresent()) { compact(compaction.get(), s, throughputController, null); } } /** * Called by compaction thread and after region is opened to compact the * HStores if necessary. * *

This operation could block for a long time, so don't call it from a * time-sensitive thread. * * Note that no locking is necessary at this level because compaction only * conflicts with a region split, and that cannot happen because the region * server does them sequentially and not in parallel. * * @param compaction Compaction details, obtained by requestCompaction() * @param throughputController * @return whether the compaction completed */ public boolean compact(CompactionContext compaction, HStore store, ThroughputController throughputController) throws IOException { return compact(compaction, store, throughputController, null); } public boolean compact(CompactionContext compaction, HStore store, ThroughputController throughputController, User user) throws IOException { assert compaction != null && compaction.hasSelection(); assert !compaction.getRequest().getFiles().isEmpty(); if (this.closing.get() || this.closed.get()) { LOG.debug("Skipping compaction on " + this + " because closing/closed"); store.cancelRequestedCompaction(compaction); return false; } MonitoredTask status = null; boolean requestNeedsCancellation = true; /* * We are trying to remove / relax the region read lock for compaction. * Let's see what are the potential race conditions among the operations (user scan, * region split, region close and region bulk load). * * user scan ---> region read lock * region split --> region close first --> region write lock * region close --> region write lock * region bulk load --> region write lock * * read lock is compatible with read lock. ---> no problem with user scan/read * region bulk load does not cause problem for compaction (no consistency problem, store lock * will help the store file accounting). * They can run almost concurrently at the region level. * * The only remaining race condition is between the region close and compaction. * So we will evaluate, below, how region close intervenes with compaction if compaction does * not acquire region read lock. * * Here are the steps for compaction: * 1. obtain list of StoreFile's * 2. create StoreFileScanner's based on list from #1 * 3. perform compaction and save resulting files under tmp dir * 4. swap in compacted files * * #1 is guarded by store lock. This patch does not change this --> no worse or better * For #2, we obtain smallest read point (for region) across all the Scanners (for both default * compactor and stripe compactor). * The read points are for user scans. Region keeps the read points for all currently open * user scanners. * Compaction needs to know the smallest read point so that during re-write of the hfiles, * it can remove the mvcc points for the cells if their mvccs are older than the smallest * since they are not needed anymore. * This will not conflict with compaction. * For #3, it can be performed in parallel to other operations. * For #4 bulk load and compaction don't conflict with each other on the region level * (for multi-family atomicy). * Region close and compaction are guarded pretty well by the 'writestate'. * In HRegion#doClose(), we have : * synchronized (writestate) { * // Disable compacting and flushing by background threads for this * // region. * canFlush = !writestate.readOnly; * writestate.writesEnabled = false; * LOG.debug("Closing " + this + ": disabling compactions & flushes"); * waitForFlushesAndCompactions(); * } * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0. * and in HRegion.compact() * try { * synchronized (writestate) { * if (writestate.writesEnabled) { * wasStateSet = true; * ++writestate.compacting; * } else { * String msg = "NOT compacting region " + this + ". Writes disabled."; * LOG.info(msg); * status.abort(msg); * return false; * } * } * Also in compactor.performCompaction(): * check periodically to see if a system stop is requested * if (closeCheckInterval > 0) { * bytesWritten += len; * if (bytesWritten > closeCheckInterval) { * bytesWritten = 0; * if (!store.areWritesEnabled()) { * progress.cancel(); * return false; * } * } * } */ try { byte[] cf = Bytes.toBytes(store.getColumnFamilyName()); if (stores.get(cf) != store) { LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this + " has been re-instantiated, cancel this compaction request. " + " It may be caused by the roll back of split transaction"); return false; } status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this); status.enableStatusJournal(false); if (this.closed.get()) { String msg = "Skipping compaction on " + this + " because closed"; LOG.debug(msg); status.abort(msg); return false; } boolean wasStateSet = false; try { synchronized (writestate) { if (writestate.writesEnabled) { wasStateSet = true; writestate.compacting.incrementAndGet(); } else { String msg = "NOT compacting region " + this + ". Writes disabled."; LOG.info(msg); status.abort(msg); return false; } } LOG.info("Starting compaction of {} in {}{}", store, this, (compaction.getRequest().isOffPeak()?" as an off-peak compaction":"")); doRegionCompactionPrep(); try { status.setStatus("Compacting store " + store); // We no longer need to cancel the request on the way out of this // method because Store#compact will clean up unconditionally requestNeedsCancellation = false; store.compact(compaction, throughputController, user); } catch (InterruptedIOException iioe) { String msg = "region " + this + " compaction interrupted"; LOG.info(msg, iioe); status.abort(msg); return false; } } finally { if (wasStateSet) { synchronized (writestate) { writestate.compacting.decrementAndGet(); if (writestate.compacting.get() <= 0) { writestate.notifyAll(); } } } } status.markComplete("Compaction complete"); return true; } finally { if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction); if (status != null) { LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); status.cleanup(); } } } /** * Flush the cache. * *

When this method is called the cache will be flushed unless: *

    *
  1. the cache is empty
  2. *
  3. the region is closed.
  4. *
  5. a flush is already in progress
  6. *
  7. writes are disabled
  8. *
* *

This method may block for some time, so it should not be called from a * time-sensitive thread. * @param force whether we want to force a flush of all stores * @return FlushResult indicating whether the flush was successful or not and if * the region needs compacting * * @throws IOException general io exceptions * because a snapshot was not properly persisted. */ // TODO HBASE-18905. We might have to expose a requestFlush API for CPs public FlushResult flush(boolean force) throws IOException { return flushcache(force, false, FlushLifeCycleTracker.DUMMY); } public interface FlushResult { enum Result { FLUSHED_NO_COMPACTION_NEEDED, FLUSHED_COMPACTION_NEEDED, // Special case where a flush didn't run because there's nothing in the memstores. Used when // bulk loading to know when we can still load even if a flush didn't happen. CANNOT_FLUSH_MEMSTORE_EMPTY, CANNOT_FLUSH } /** @return the detailed result code */ Result getResult(); /** @return true if the memstores were flushed, else false */ boolean isFlushSucceeded(); /** @return True if the flush requested a compaction, else false */ boolean isCompactionNeeded(); } /** * Flush the cache. * * When this method is called the cache will be flushed unless: *

    *
  1. the cache is empty
  2. *
  3. the region is closed.
  4. *
  5. a flush is already in progress
  6. *
  7. writes are disabled
  8. *
* *

This method may block for some time, so it should not be called from a * time-sensitive thread. * @param forceFlushAllStores whether we want to flush all stores * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL * @param tracker used to track the life cycle of this flush * @return whether the flush is success and whether the region needs compacting * * @throws IOException general io exceptions * @throws DroppedSnapshotException Thrown when replay of wal is required * because a Snapshot was not properly persisted. The region is put in closing mode, and the * caller MUST abort after this. */ public FlushResultImpl flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker, FlushLifeCycleTracker tracker) throws IOException { // fail-fast instead of waiting on the lock if (this.closing.get()) { String msg = "Skipping flush on " + this + " because closing"; LOG.debug(msg); return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); } MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this); status.enableStatusJournal(false); status.setStatus("Acquiring readlock on region"); // block waiting for the lock for flushing cache lock.readLock().lock(); try { if (this.closed.get()) { String msg = "Skipping flush on " + this + " because closed"; LOG.debug(msg); status.abort(msg); return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); } if (coprocessorHost != null) { status.setStatus("Running coprocessor pre-flush hooks"); coprocessorHost.preFlush(tracker); } // TODO: this should be managed within memstore with the snapshot, updated only after flush // successful if (numMutationsWithoutWAL.sum() > 0) { numMutationsWithoutWAL.reset(); dataInMemoryWithoutWAL.reset(); } synchronized (writestate) { if (!writestate.flushing && writestate.writesEnabled) { this.writestate.flushing = true; } else { if (LOG.isDebugEnabled()) { LOG.debug("NOT flushing memstore for region " + this + ", flushing=" + writestate.flushing + ", writesEnabled=" + writestate.writesEnabled); } String msg = "Not flushing since " + (writestate.flushing ? "already flushing" : "writes not enabled"); status.abort(msg); return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); } } try { Collection specificStoresToFlush = forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush(); FlushResultImpl fs = internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker); if (coprocessorHost != null) { status.setStatus("Running post-flush coprocessor hooks"); coprocessorHost.postFlush(tracker); } if(fs.isFlushSucceeded()) { flushesQueued.reset(); } status.markComplete("Flush successful " + fs.toString()); return fs; } finally { synchronized (writestate) { writestate.flushing = false; this.writestate.flushRequested = false; writestate.notifyAll(); } } } finally { lock.readLock().unlock(); LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), status.prettyPrintJournal()); status.cleanup(); } } /** * Should the store be flushed because it is old enough. *

* Every FlushPolicy should call this to determine whether a store is old enough to flush (except * that you always flush all stores). Otherwise the method will always * returns true which will make a lot of flush requests. */ boolean shouldFlushStore(HStore store) { long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), store.getColumnFamilyDescriptor().getName()) - 1; if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) { if (LOG.isDebugEnabled()) { LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " + getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest + " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint()); } return true; } if (this.flushCheckInterval <= 0) { return false; } long now = EnvironmentEdgeManager.currentTime(); if (store.timeOfOldestEdit() < now - this.flushCheckInterval) { if (LOG.isDebugEnabled()) { LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " + getRegionInfo().getEncodedName() + " because time of oldest edit=" + store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now); } return true; } return false; } /** * Should the memstore be flushed now */ boolean shouldFlush(final StringBuilder whyFlush) { whyFlush.setLength(0); // This is a rough measure. if (this.maxFlushedSeqId > 0 && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())) { whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush"); return true; } long modifiedFlushCheckInterval = flushCheckInterval; if (getRegionInfo().getTable().isSystemTable() && getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL; } if (modifiedFlushCheckInterval <= 0) { //disabled return false; } long now = EnvironmentEdgeManager.currentTime(); //if we flushed in the recent past, we don't need to do again now if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) { return false; } //since we didn't flush in the recent past, flush now if certain conditions //are met. Return true on first such memstore hit. for (HStore s : stores.values()) { if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) { // we have an old enough edit in the memstore, flush whyFlush.append(s.toString() + " has an old edit so flush to free WALs"); return true; } } return false; } /** * Flushing all stores. * @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker) */ private FlushResult internalFlushcache(MonitoredTask status) throws IOException { return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } /** * Flushing given stores. * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker) */ private FlushResultImpl internalFlushcache(Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status, writeFlushWalMarker, tracker); } /** * Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the * memstore, all of which have also been written to the wal. We need to write those updates in the * memstore out to disk, while being able to process reads/writes as much as possible during the * flush operation. *

* This method may block for some time. Every time you call it, we up the regions sequence id even * if we don't flush; i.e. the returned region id will be at least one larger than the last edit * applied to this region. The returned id does not refer to an actual edit. The returned id can * be used for say installing a bulk loaded file just ahead of the last hfile that was the result * of this flush, etc. * @param wal Null if we're NOT to go via wal. * @param myseqid The seqid to use if wal is null writing out flush file. * @param storesToFlush The list of stores to flush. * @return object describing the flush's state * @throws IOException general io exceptions * @throws DroppedSnapshotException Thrown when replay of WAL is required. */ protected FlushResultImpl internalFlushcache(WAL wal, long myseqid, Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { PrepareFlushResult result = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker); if (result.result == null) { return internalFlushCacheAndCommit(wal, status, result, storesToFlush); } else { return result.result; // early exit due to failure from prepare stage } } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DLS_DEAD_LOCAL_STORE", justification="FindBugs seems confused about trxId") protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid, Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { if (this.rsServices != null && this.rsServices.isAborted()) { // Don't flush when server aborting, it's unsafe throw new IOException("Aborting flush because server is aborted..."); } final long startTime = EnvironmentEdgeManager.currentTime(); // If nothing to flush, return, but return with a valid unused sequenceId. // Its needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a // bulk loaded file between memory and existing hfiles. It wants a good seqeunceId that belongs // to no other that it can use to associate with the bulk load. Hence this little dance below // to go get one. if (this.memStoreSizing.getDataSize() <= 0) { // Take an update lock so no edits can come into memory just yet. this.updatesLock.writeLock().lock(); WriteEntry writeEntry = null; try { if (this.memStoreSizing.getDataSize() <= 0) { // Presume that if there are still no edits in the memstore, then there are no edits for // this region out in the WAL subsystem so no need to do any trickery clearing out // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for // sure just beyond the last appended region edit and not associated with any edit // (useful as marker when bulk loading, etc.). if (wal != null) { writeEntry = mvcc.begin(); long flushOpSeqId = writeEntry.getWriteNumber(); FlushResultImpl flushResult = new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, "Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker)); mvcc.completeAndWait(writeEntry); // Set to null so we don't complete it again down in finally block. writeEntry = null; return new PrepareFlushResult(flushResult, myseqid); } else { return new PrepareFlushResult(new FlushResultImpl( FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid); } } } finally { if (writeEntry != null) { // If writeEntry is non-null, this operation failed; the mvcc transaction failed... // but complete it anyways so it doesn't block the mvcc queue. mvcc.complete(writeEntry); } this.updatesLock.writeLock().unlock(); } } logFatLineOnFlush(storesToFlush, myseqid); // Stop updates while we snapshot the memstore of all of these regions' stores. We only have // to do this for a moment. It is quick. We also set the memstore size to zero here before we // allow updates again so its value will represent the size of the updates received // during flush // We have to take an update lock during snapshot, or else a write could end up in both snapshot // and memstore (makes it difficult to do atomic rows then) status.setStatus("Obtaining lock to block concurrent updates"); // block waiting for the lock for internal flush this.updatesLock.writeLock().lock(); status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName()); MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing(); Map flushedFamilyNamesToSeq = new HashMap<>(); for (HStore store : storesToFlush) { flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(), store.preFlushSeqIDEstimation()); } TreeMap storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR); TreeMap> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); TreeMap storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR); // The sequence id of this flush operation which is used to log FlushMarker and pass to // createFlushContext to use as the store file's sequence id. It can be in advance of edits // still in the memstore, edits that are in other column families yet to be flushed. long flushOpSeqId = HConstants.NO_SEQNUM; // The max flushed sequence id after this flush operation completes. All edits in memstore // will be in advance of this sequence id. long flushedSeqId = HConstants.NO_SEQNUM; byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes(); try { if (wal != null) { Long earliestUnflushedSequenceIdForTheRegion = wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq); if (earliestUnflushedSequenceIdForTheRegion == null) { // This should never happen. This is how startCacheFlush signals flush cannot proceed. String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; status.setStatus(msg); return new PrepareFlushResult( new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid); } flushOpSeqId = getNextSequenceId(wal); // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit flushedSeqId = earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM? flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1; } else { // use the provided sequence Id as WAL is not being used for this flush. flushedSeqId = flushOpSeqId = myseqid; } for (HStore s : storesToFlush) { storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(), s.createFlushContext(flushOpSeqId, tracker)); // for writing stores to WAL committedFiles.put(s.getColumnFamilyDescriptor().getName(), null); } // write the snapshot start to WAL if (wal != null && !writestate.readOnly) { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); // No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc); } // Prepare flush (take a snapshot) storeFlushCtxs.forEach((name, flush) -> { MemStoreSize snapshotSize = flush.prepare(); totalSizeOfFlushableStores.incMemStoreSize(snapshotSize); storeFlushableSize.put(name, snapshotSize); }); } catch (IOException ex) { doAbortFlushToWAL(wal, flushOpSeqId, committedFiles); throw ex; } finally { this.updatesLock.writeLock().unlock(); } String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " + "flushsize=" + totalSizeOfFlushableStores; status.setStatus(s); doSyncOfUnflushedWALChanges(wal, getRegionInfo()); return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores); } /** * Utility method broken out of internalPrepareFlushCache so that method is smaller. */ private void logFatLineOnFlush(Collection storesToFlush, long sequenceId) { if (!LOG.isInfoEnabled()) { return; } // Log a fat line detailing what is being flushed. StringBuilder perCfExtras = null; if (!isAllFamilies(storesToFlush)) { perCfExtras = new StringBuilder(); for (HStore store: storesToFlush) { MemStoreSize mss = store.getFlushableSize(); perCfExtras.append("; ").append(store.getColumnFamilyName()); perCfExtras.append("={dataSize=") .append(StringUtils.byteDesc(mss.getDataSize())); perCfExtras.append(", heapSize=") .append(StringUtils.byteDesc(mss.getHeapSize())); perCfExtras.append(", offHeapSize=") .append(StringUtils.byteDesc(mss.getOffHeapSize())); perCfExtras.append("}"); } } MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " + storesToFlush.size() + "/" + stores.size() + " column families," + " dataSize=" + StringUtils.byteDesc(mss.getDataSize()) + " heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) + ((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") + ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId)); } private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId, final Map> committedFiles) { if (wal == null) return; try { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc); } catch (Throwable t) { LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in " + " region {}", StringUtils.stringifyException(t), this); // ignore this since we will be aborting the RS with DSE. } // we have called wal.startCacheFlush(), now we have to abort it wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); } /** * Sync unflushed WAL changes. See HBASE-8208 for details */ private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri) throws IOException { if (wal == null) { return; } try { wal.sync(); // ensure that flush marker is sync'ed } catch (IOException ioe) { wal.abortCacheFlush(hri.getEncodedNameAsBytes()); throw ioe; } } /** * @return True if passed Set is all families in the region. */ private boolean isAllFamilies(Collection families) { return families == null || this.stores.size() == families.size(); } /** * Writes a marker to WAL indicating a flush is requested but cannot be complete due to various * reasons. Ignores exceptions from WAL. Returns whether the write succeeded. * @param wal * @return whether WAL write was successful */ private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) { if (writeFlushWalMarker && wal != null && !writestate.readOnly) { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH, getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR)); try { WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, mvcc); return true; } catch (IOException e) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received exception while trying to write the flush request to wal", e); } } return false; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY", justification="Intentional; notify is about completed flush") protected FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status, PrepareFlushResult prepareResult, Collection storesToFlush) throws IOException { // prepare flush context is carried via PrepareFlushResult TreeMap storeFlushCtxs = prepareResult.storeFlushCtxs; TreeMap> committedFiles = prepareResult.committedFiles; long startTime = prepareResult.startTime; long flushOpSeqId = prepareResult.flushOpSeqId; long flushedSeqId = prepareResult.flushedSeqId; String s = "Flushing stores of " + this; status.setStatus(s); if (LOG.isTraceEnabled()) LOG.trace(s); // Any failure from here on out will be catastrophic requiring server // restart so wal content can be replayed and put back into the memstore. // Otherwise, the snapshot content while backed up in the wal, it will not // be part of the current running servers state. boolean compactionRequested = false; long flushedOutputFileSize = 0; try { // A. Flush memstore to all the HStores. // Keep running vector of all store files that includes both old and the // just-made new flush store file. The new flushed file is still in the // tmp directory. for (StoreFlushContext flush : storeFlushCtxs.values()) { flush.flushCache(status); } // Switch snapshot (in memstore) -> new hfile (thus causing // all the store scanners to reset/reseek). for (Map.Entry flushEntry : storeFlushCtxs.entrySet()) { StoreFlushContext sfc = flushEntry.getValue(); boolean needsCompaction = sfc.commit(status); if (needsCompaction) { compactionRequested = true; } byte[] storeName = flushEntry.getKey(); List storeCommittedFiles = sfc.getCommittedFiles(); committedFiles.put(storeName, storeCommittedFiles); // Flush committed no files, indicating flush is empty or flush was canceled if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) { MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName); prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize); } flushedOutputFileSize += sfc.getOutputFileSize(); } storeFlushCtxs.clear(); // Set down the memstore size by amount of flush. MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); this.decrMemStoreSize(mss); // Increase the size of this Region for the purposes of quota. Noop if quotas are disabled. // During startup, quota manager may not be initialized yet. if (rsServices != null) { RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager(); if (quotaManager != null) { quotaManager.getRegionSizeStore().incrementRegionSize( this.getRegionInfo(), flushedOutputFileSize); } } if (wal != null) { // write flush marker to WAL. If fail, we should throw DroppedSnapshotException FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, mvcc); } } catch (Throwable t) { // An exception here means that the snapshot was not persisted. // The wal needs to be replayed so its content is restored to memstore. // Currently, only a server restart will do this. // We used to only catch IOEs but its possible that we'd get other // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch // all and sundry. if (wal != null) { try { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc); } catch (Throwable ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL", ex); // ignore this since we will be aborting the RS with DSE. } wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); } DroppedSnapshotException dse = new DroppedSnapshotException("region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()), t); status.abort("Flush failed: " + StringUtils.stringifyException(t)); // Callers for flushcache() should catch DroppedSnapshotException and abort the region server. // However, since we may have the region read lock, we cannot call close(true) here since // we cannot promote to a write lock. Instead we are setting closing so that all other region // operations except for close will be rejected. this.closing.set(true); if (rsServices != null) { // This is a safeguard against the case where the caller fails to explicitly handle aborting rsServices.abort("Replay of WAL required. Forcing server shutdown", dse); } throw dse; } // If we get to here, the HStores have been written. if (wal != null) { wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId); } // Record latest flush time for (HStore store: storesToFlush) { this.lastStoreFlushTimeMap.put(store, startTime); } this.maxFlushedSeqId = flushedSeqId; this.lastFlushOpSeqId = flushOpSeqId; // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } long time = EnvironmentEdgeManager.currentTime() - startTime; MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize(); String msg = "Finished flush of" + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" + mss.getDataSize() + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" + mss.getHeapSize() + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" + memstoresize + " for " + this.getRegionInfo().getEncodedName() + " in " + time + "ms, sequenceid=" + flushOpSeqId + ", compaction requested=" + compactionRequested + ((wal == null) ? "; wal=null" : ""); LOG.info(msg); status.setStatus(msg); if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(), time, mss.getDataSize(), flushedOutputFileSize); } return new FlushResultImpl(compactionRequested ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED : FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId); } /** * Method to safely get the next sequence number. * @return Next sequence number unassociated with any actual edit. * @throws IOException */ @VisibleForTesting protected long getNextSequenceId(final WAL wal) throws IOException { WriteEntry we = mvcc.begin(); mvcc.completeAndWait(we); return we.getWriteNumber(); } ////////////////////////////////////////////////////////////////////////////// // get() methods for client use. ////////////////////////////////////////////////////////////////////////////// @Override public RegionScannerImpl getScanner(Scan scan) throws IOException { return getScanner(scan, null); } @Override public RegionScannerImpl getScanner(Scan scan, List additionalScanners) throws IOException { return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE); } private RegionScannerImpl getScanner(Scan scan, List additionalScanners, long nonceGroup, long nonce) throws IOException { startRegionOperation(Operation.SCAN); try { // Verify families are all valid if (!scan.hasFamilies()) { // Adding all families to scanner for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { scan.addFamily(family); } } else { for (byte[] family : scan.getFamilyMap().keySet()) { checkFamily(family); } } return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce); } finally { closeRegionOperation(Operation.SCAN); } } protected RegionScanner instantiateRegionScanner(Scan scan, List additionalScanners) throws IOException { return instantiateRegionScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE); } protected RegionScannerImpl instantiateRegionScanner(Scan scan, List additionalScanners, long nonceGroup, long nonce) throws IOException { if (scan.isReversed()) { if (scan.getFilter() != null) { scan.getFilter().setReversed(true); } return new ReversedRegionScannerImpl(scan, additionalScanners, this); } return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); } /** * Prepare a delete for a row mutation processor * @param delete The passed delete is modified by this method. WARNING! * @throws IOException */ public void prepareDelete(Delete delete) throws IOException { // Check to see if this is a deleteRow insert if(delete.getFamilyCellMap().isEmpty()){ for(byte [] family : this.htableDescriptor.getColumnFamilyNames()){ // Don't eat the timestamp delete.addFamily(family, delete.getTimestamp()); } } else { for(byte [] family : delete.getFamilyCellMap().keySet()) { if(family == null) { throw new NoSuchColumnFamilyException("Empty family is invalid"); } checkFamily(family); } } } @Override public void delete(Delete delete) throws IOException { checkReadOnly(); checkResources(); startRegionOperation(Operation.DELETE); try { // All edits for the given row (across all column families) must happen atomically. doBatchMutate(delete); } finally { closeRegionOperation(Operation.DELETE); } } /** * Row needed by below method. */ private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly"); /** * This is used only by unit tests. Not required to be a public API. * @param familyMap map of family to edits for the given family. * @throws IOException */ void delete(NavigableMap> familyMap, Durability durability) throws IOException { Delete delete = new Delete(FOR_UNIT_TESTS_ONLY); delete.setFamilyCellMap(familyMap); delete.setDurability(durability); doBatchMutate(delete); } /** * Set up correct timestamps in the KVs in Delete object. *

Caller should have the row and region locks. * @param mutation * @param familyMap * @param byteNow * @throws IOException */ public void prepareDeleteTimestamps(Mutation mutation, Map> familyMap, byte[] byteNow) throws IOException { for (Map.Entry> e : familyMap.entrySet()) { byte[] family = e.getKey(); List cells = e.getValue(); assert cells instanceof RandomAccess; Map kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR); int listSize = cells.size(); for (int i=0; i < listSize; i++) { Cell cell = cells.get(i); // Check if time is LATEST, change to time of most recent addition if so // This is expensive. if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && PrivateCellUtil.isDeleteType(cell)) { byte[] qual = CellUtil.cloneQualifier(cell); Integer count = kvCount.get(qual); if (count == null) { kvCount.put(qual, 1); } else { kvCount.put(qual, count + 1); } count = kvCount.get(qual); Get get = new Get(CellUtil.cloneRow(cell)); get.setMaxVersions(count); get.addColumn(family, qual); if (coprocessorHost != null) { if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell, byteNow, get)) { updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); } } else { updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); } } else { PrivateCellUtil.updateLatestStamp(cell, byteNow); } } } } void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow) throws IOException { List result = get(get, false); if (result.size() < count) { // Nothing to delete PrivateCellUtil.updateLatestStamp(cell, byteNow); return; } if (result.size() > count) { throw new RuntimeException("Unexpected size: " + result.size()); } Cell getCell = result.get(count - 1); PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp()); } @Override public void put(Put put) throws IOException { checkReadOnly(); // Do a rough check that we have resources to accept a write. The check is // 'rough' in that between the resource check and the call to obtain a // read lock, resources may run out. For now, the thought is that this // will be extremely rare; we'll deal with it when it happens. checkResources(); startRegionOperation(Operation.PUT); try { // All edits for the given row (across all column families) must happen atomically. doBatchMutate(put); } finally { closeRegionOperation(Operation.PUT); } } /** * Class that tracks the progress of a batch operations, accumulating status codes and tracking * the index at which processing is proceeding. These batch operations may get split into * mini-batches for processing. */ private abstract static class BatchOperation { protected final T[] operations; protected final OperationStatus[] retCodeDetails; protected final WALEdit[] walEditsFromCoprocessors; // reference family cell maps directly so coprocessors can mutate them if desired protected final Map>[] familyCellMaps; protected final HRegion region; protected int nextIndexToProcess = 0; protected final ObservedExceptionsInBatch observedExceptions; //Durability of the batch (highest durability of all operations) protected Durability durability; protected boolean atomic = false; public BatchOperation(final HRegion region, T[] operations) { this.operations = operations; this.retCodeDetails = new OperationStatus[operations.length]; Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN); this.walEditsFromCoprocessors = new WALEdit[operations.length]; familyCellMaps = new Map[operations.length]; this.region = region; observedExceptions = new ObservedExceptionsInBatch(); durability = Durability.USE_DEFAULT; } /** * Visitor interface for batch operations */ @FunctionalInterface public interface Visitor { /** * @param index operation index * @return If true continue visiting remaining entries, break otherwise */ boolean visit(int index) throws IOException; } /** * Helper method for visiting pending/ all batch operations */ public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor) throws IOException { assert lastIndexExclusive <= this.size(); for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) { if (!pendingOnly || isOperationPending(i)) { if (!visitor.visit(i)) { break; } } } } public abstract Mutation getMutation(int index); public abstract long getNonceGroup(int index); public abstract long getNonce(int index); /** * This method is potentially expensive and useful mostly for non-replay CP path. */ public abstract Mutation[] getMutationsForCoprocs(); public abstract boolean isInReplay(); public abstract long getOrigLogSeqNum(); public abstract void startRegionOperation() throws IOException; public abstract void closeRegionOperation() throws IOException; /** * Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs * CP prePut()/ preDelete() hooks for all mutations in a batch. This is intended to operate on * entire batch and will be called from outside of class to check and prepare batch. This can * be implemented by calling helper method {@link #checkAndPrepareMutation(int, long)} in a * 'for' loop over mutations. */ public abstract void checkAndPrepare() throws IOException; /** * Implement any Put request specific check and prepare logic here. Please refer to * {@link #checkAndPrepareMutation(Mutation, long)} for how its used. */ protected abstract void checkAndPreparePut(final Put p) throws IOException; /** * If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell * count, tags and timestamp for all cells of all operations in a mini-batch. */ public abstract void prepareMiniBatchOperations(MiniBatchOperationInProgress miniBatchOp, long timestamp, final List acquiredRowLocks) throws IOException; /** * Write mini-batch operations to MemStore */ public abstract WriteEntry writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException; protected void writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, final long writeNumber) throws IOException { MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing(); visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { // We need to update the sequence id for following reasons. // 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id. // 2) If no WAL, FSWALEntry won't be used // we use durability of the original mutation for the mutation passed by CP. if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) { region.updateSequenceId(familyCellMaps[index].values(), writeNumber); } applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting); return true; }); // update memStore size region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(), memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount()); } public boolean isDone() { return nextIndexToProcess == operations.length; } public int size() { return operations.length; } public boolean isOperationPending(int index) { return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN; } public List getClusterIds() { assert size() != 0; return getMutation(0).getClusterIds(); } boolean isAtomic() { return atomic; } /** * Helper method that checks and prepares only one mutation. This can be used to implement * {@link #checkAndPrepare()} for entire Batch. * NOTE: As CP prePut()/ preDelete() hooks may modify mutations, this method should be called * after prePut()/ preDelete() CP hooks are run for the mutation */ protected void checkAndPrepareMutation(Mutation mutation, final long timestamp) throws IOException { region.checkRow(mutation.getRow(), "batchMutate"); if (mutation instanceof Put) { // Check the families in the put. If bad, skip this one. checkAndPreparePut((Put) mutation); region.checkTimestamps(mutation.getFamilyCellMap(), timestamp); } else { region.prepareDelete((Delete) mutation); } } protected void checkAndPrepareMutation(int index, long timestamp) throws IOException { Mutation mutation = getMutation(index); try { this.checkAndPrepareMutation(mutation, timestamp); // store the family map reference to allow for mutations familyCellMaps[index] = mutation.getFamilyCellMap(); // store durability for the batch (highest durability of all operations in the batch) Durability tmpDur = region.getEffectiveDurability(mutation.getDurability()); if (tmpDur.ordinal() > durability.ordinal()) { durability = tmpDur; } } catch (NoSuchColumnFamilyException nscfe) { final String msg = "No such column family in batch mutation in region " + this; if (observedExceptions.hasSeenNoSuchFamily()) { LOG.warn(msg + nscfe.getMessage()); } else { LOG.warn(msg, nscfe); observedExceptions.sawNoSuchFamily(); } retCodeDetails[index] = new OperationStatus( OperationStatusCode.BAD_FAMILY, nscfe.getMessage()); if (isAtomic()) { // fail, atomic means all or none throw nscfe; } } catch (FailedSanityCheckException fsce) { final String msg = "Batch Mutation did not pass sanity check in region " + this; if (observedExceptions.hasSeenFailedSanityCheck()) { LOG.warn(msg + fsce.getMessage()); } else { LOG.warn(msg, fsce); observedExceptions.sawFailedSanityCheck(); } retCodeDetails[index] = new OperationStatus( OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); if (isAtomic()) { throw fsce; } } catch (WrongRegionException we) { final String msg = "Batch mutation had a row that does not belong to this region " + this; if (observedExceptions.hasSeenWrongRegion()) { LOG.warn(msg + we.getMessage()); } else { LOG.warn(msg, we); observedExceptions.sawWrongRegion(); } retCodeDetails[index] = new OperationStatus( OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); if (isAtomic()) { throw we; } } } /** * Creates Mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which * a row lock can be acquired. All mutations with locked rows are considered to be * In-progress operations and hence the name {@link MiniBatchOperationInProgress}. Mini batch * is window over {@link BatchOperation} and contains contiguous pending operations. * * @param acquiredRowLocks keeps track of rowLocks acquired. */ public MiniBatchOperationInProgress lockRowsAndBuildMiniBatch( List acquiredRowLocks) throws IOException { int readyToWriteCount = 0; int lastIndexExclusive = 0; RowLock prevRowLock = null; for (; lastIndexExclusive < size(); lastIndexExclusive++) { // It reaches the miniBatchSize, stop here and process the miniBatch // This only applies to non-atomic batch operations. if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) { break; } if (!isOperationPending(lastIndexExclusive)) { continue; } // HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting // RS handlers, covering both MutationBatchOperation and ReplayBatchOperation // The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't // pass the isOperationPending check Map> curFamilyCellMap = getMutation(lastIndexExclusive).getFamilyCellMap(); try { // start the protector before acquiring row lock considering performance, and will finish // it when encountering exception region.storeHotnessProtector.start(curFamilyCellMap); } catch (RegionTooBusyException rtbe) { region.storeHotnessProtector.finish(curFamilyCellMap); if (isAtomic()) { throw rtbe; } retCodeDetails[lastIndexExclusive] = new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage()); continue; } Mutation mutation = getMutation(lastIndexExclusive); // If we haven't got any rows in our batch, we should block to get the next one. RowLock rowLock = null; boolean throwException = false; try { // if atomic then get exclusive lock, else shared lock rowLock = region.getRowLockInternal(mutation.getRow(), !isAtomic(), prevRowLock); } catch (TimeoutIOException | InterruptedIOException e) { // NOTE: We will retry when other exceptions, but we should stop if we receive // TimeoutIOException or InterruptedIOException as operation has timed out or // interrupted respectively. throwException = true; throw e; } catch (IOException ioe) { LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(mutation.getRow()), this, ioe); if (isAtomic()) { // fail, atomic means all or none throwException = true; throw ioe; } } catch (Throwable throwable) { throwException = true; throw throwable; } finally { if (throwException) { region.storeHotnessProtector.finish(curFamilyCellMap); } } if (rowLock == null) { // We failed to grab another lock if (isAtomic()) { region.storeHotnessProtector.finish(curFamilyCellMap); throw new IOException("Can't apply all operations atomically!"); } break; // Stop acquiring more rows for this batch } else { if (rowLock != prevRowLock) { // It is a different row now, add this to the acquiredRowLocks and // set prevRowLock to the new returned rowLock acquiredRowLocks.add(rowLock); prevRowLock = rowLock; } } readyToWriteCount++; } return createMiniBatch(lastIndexExclusive, readyToWriteCount); } protected MiniBatchOperationInProgress createMiniBatch(final int lastIndexExclusive, final int readyToWriteCount) { return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails, walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount); } /** * Builds separate WALEdit per nonce by applying input mutations. If WALEdits from CP are * present, they are merged to result WALEdit. */ public List> buildWALEdits( final MiniBatchOperationInProgress miniBatchOp) throws IOException { List> walEdits = new ArrayList<>(); visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() { private Pair curWALEditForNonce; @Override public boolean visit(int index) throws IOException { Mutation m = getMutation(index); // we use durability of the original mutation for the mutation passed by CP. if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) { region.recordMutationWithoutWal(m.getFamilyCellMap()); return true; } // the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each. // Given how nonce keys are originally written, these should be contiguous. // They don't have to be, it will still work, just write more WALEdits than needed. long nonceGroup = getNonceGroup(index); long nonce = getNonce(index); if (curWALEditForNonce == null || curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup || curWALEditForNonce.getFirst().getNonce() != nonce) { curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce), new WALEdit(miniBatchOp.getCellCount(), isInReplay())); walEdits.add(curWALEditForNonce); } WALEdit walEdit = curWALEditForNonce.getSecond(); // Add WAL edits from CPs. WALEdit fromCP = walEditsFromCoprocessors[index]; if (fromCP != null) { for (Cell cell : fromCP.getCells()) { walEdit.add(cell); } } walEdit.add(familyCellMaps[index]); return true; } }); return walEdits; } /** * This method completes mini-batch operations by calling postBatchMutate() CP hook (if * required) and completing mvcc. */ public void completeMiniBatchOperations( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { if (writeEntry != null) { region.mvcc.completeAndWait(writeEntry); } } public void doPostOpCleanupForMiniBatch( final MiniBatchOperationInProgress miniBatchOp, final WALEdit walEdit, boolean success) throws IOException { doFinishHotnessProtector(miniBatchOp); } private void doFinishHotnessProtector( final MiniBatchOperationInProgress miniBatchOp) { // check and return if the protector is not enabled if (!region.storeHotnessProtector.isEnable()) { return; } // miniBatchOp is null, if and only if lockRowsAndBuildMiniBatch throwing exception. // This case was handled. if (miniBatchOp == null) { return; } final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive(); for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) { switch (retCodeDetails[i].getOperationStatusCode()) { case SUCCESS: case FAILURE: region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap()); break; default: // do nothing // We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the // STORE_TOO_BUSY case is handled in StoreHotnessProtector#start break; } } } /** * Atomically apply the given map of family->edits to the memstore. * This handles the consistency control on its own, but the caller * should already have locked updatesLock.readLock(). This also does * not check the families for validity. * * @param familyMap Map of Cells by family */ protected void applyFamilyMapToMemStore(Map> familyMap, MemStoreSizing memstoreAccounting) throws IOException { for (Map.Entry> e : familyMap.entrySet()) { byte[] family = e.getKey(); List cells = e.getValue(); assert cells instanceof RandomAccess; region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting); } } } /** * Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most * of the logic is same. */ static class MutationBatchOperation extends BatchOperation { private long nonceGroup; private long nonce; public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic, long nonceGroup, long nonce) { super(region, operations); this.atomic = atomic; this.nonceGroup = nonceGroup; this.nonce = nonce; } @Override public Mutation getMutation(int index) { return this.operations[index]; } @Override public long getNonceGroup(int index) { return nonceGroup; } @Override public long getNonce(int index) { return nonce; } @Override public Mutation[] getMutationsForCoprocs() { return this.operations; } @Override public boolean isInReplay() { return false; } @Override public long getOrigLogSeqNum() { return SequenceId.NO_SEQUENCE_ID; } @Override public void startRegionOperation() throws IOException { region.startRegionOperation(Operation.BATCH_MUTATE); } @Override public void closeRegionOperation() throws IOException { region.closeRegionOperation(Operation.BATCH_MUTATE); } @Override public void checkAndPreparePut(Put p) throws IOException { region.checkFamilies(p.getFamilyCellMap().keySet()); } @Override public void checkAndPrepare() throws IOException { final int[] metrics = {0, 0}; // index 0: puts, index 1: deletes visitBatchOperations(true, this.size(), new Visitor() { private long now = EnvironmentEdgeManager.currentTime(); private WALEdit walEdit; @Override public boolean visit(int index) throws IOException { // Run coprocessor pre hook outside of locks to avoid deadlock if (region.coprocessorHost != null) { if (walEdit == null) { walEdit = new WALEdit(); } callPreMutateCPHook(index, walEdit, metrics); if (!walEdit.isEmpty()) { walEditsFromCoprocessors[index] = walEdit; walEdit = null; } } if (isOperationPending(index)) { // TODO: Currently validation is done with current time before acquiring locks and // updates are done with different timestamps after acquiring locks. This behavior is // inherited from the code prior to this change. Can this be changed? checkAndPrepareMutation(index, now); } return true; } }); // FIXME: we may update metrics twice! here for all operations bypassed by CP and later in // normal processing. // Update metrics in same way as it is done when we go the normal processing route (we now // update general metrics though a Coprocessor did the work). if (region.metricsRegion != null) { if (metrics[0] > 0) { // There were some Puts in the batch. region.metricsRegion.updatePut(); } if (metrics[1] > 0) { // There were some Deletes in the batch. region.metricsRegion.updateDelete(); } } } @Override public void prepareMiniBatchOperations(MiniBatchOperationInProgress miniBatchOp, long timestamp, final List acquiredRowLocks) throws IOException { byte[] byteTS = Bytes.toBytes(timestamp); visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { Mutation mutation = getMutation(index); if (mutation instanceof Put) { region.updateCellTimestamps(familyCellMaps[index].values(), byteTS); miniBatchOp.incrementNumOfPuts(); } else { region.prepareDeleteTimestamps(mutation, familyCellMaps[index], byteTS); miniBatchOp.incrementNumOfDeletes(); } region.rewriteCellTags(familyCellMaps[index], mutation); // update cell count if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { for (List cells : mutation.getFamilyCellMap().values()) { miniBatchOp.addCellCount(cells.size()); } } WALEdit fromCP = walEditsFromCoprocessors[index]; if (fromCP != null) { miniBatchOp.addCellCount(fromCP.size()); } return true; }); if (region.coprocessorHost != null) { // calling the pre CP hook for batch mutation region.coprocessorHost.preBatchMutate(miniBatchOp); checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp); } } @Override public List> buildWALEdits(final MiniBatchOperationInProgress miniBatchOp) throws IOException { List> walEdits = super.buildWALEdits(miniBatchOp); // for MutationBatchOperation, more than one nonce is not allowed if (walEdits.size() > 1) { throw new IOException("Found multiple nonce keys per batch!"); } return walEdits; } @Override public WriteEntry writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, @Nullable WriteEntry writeEntry) throws IOException { if (writeEntry == null) { writeEntry = region.mvcc.begin(); } super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber()); return writeEntry; } @Override public void completeMiniBatchOperations( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { // TODO: can it be done after completing mvcc? // calling the post CP hook for batch mutation if (region.coprocessorHost != null) { region.coprocessorHost.postBatchMutate(miniBatchOp); } super.completeMiniBatchOperations(miniBatchOp, writeEntry); } @Override public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress miniBatchOp, final WALEdit walEdit, boolean success) throws IOException { super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success); if (miniBatchOp != null) { // synced so that the coprocessor contract is adhered to. if (region.coprocessorHost != null) { visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { // only for successful puts if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) { Mutation m = getMutation(i); if (m instanceof Put) { region.coprocessorHost.postPut((Put) m, walEdit, m.getDurability()); } else { region.coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability()); } } return true; }); } // See if the column families were consistent through the whole thing. // if they were then keep them. If they were not then pass a null. // null will be treated as unknown. // Total time taken might be involving Puts and Deletes. // Split the time for puts and deletes based on the total number of Puts and Deletes. if (region.metricsRegion != null) { if (miniBatchOp.getNumOfPuts() > 0) { // There were some Puts in the batch. region.metricsRegion.updatePut(); } if (miniBatchOp.getNumOfDeletes() > 0) { // There were some Deletes in the batch. region.metricsRegion.updateDelete(); } } } if (region.coprocessorHost != null) { // call the coprocessor hook to do any finalization steps after the put is done region.coprocessorHost.postBatchMutateIndispensably( miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success); } } /** * Runs prePut/ preDelete coprocessor hook for input mutation in a batch * @param metrics Array of 2 ints. index 0: count of puts and index 1: count of deletes */ private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics) throws IOException { Mutation m = getMutation(index); if (m instanceof Put) { if (region.coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) { // pre hook says skip this Put // mark as success and skip in doMiniBatchMutation metrics[0]++; retCodeDetails[index] = OperationStatus.SUCCESS; } } else if (m instanceof Delete) { Delete curDel = (Delete) m; if (curDel.getFamilyCellMap().isEmpty()) { // handle deleting a row case // TODO: prepareDelete() has been called twice, before and after preDelete() CP hook. // Can this be avoided? region.prepareDelete(curDel); } if (region.coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) { // pre hook says skip this Delete // mark as success and skip in doMiniBatchMutation metrics[1]++; retCodeDetails[index] = OperationStatus.SUCCESS; } } else { String msg = "Put/Delete mutations only supported in a batch"; // In case of passing Append mutations along with the Puts and Deletes in batchMutate // mark the operation return code as failure so that it will not be considered in // the doMiniBatchMutation retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg); if (isAtomic()) { // fail, atomic means all or none throw new IOException(msg); } } } private void checkAndMergeCPMutations(final MiniBatchOperationInProgress miniBatchOp, final List acquiredRowLocks, final long timestamp) throws IOException { visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> { // we pass (i - firstIndex) below since the call expects a relative index Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess); if (cpMutations == null) { return true; } // Else Coprocessor added more Mutations corresponding to the Mutation at this index. Mutation mutation = getMutation(i); for (Mutation cpMutation : cpMutations) { this.checkAndPrepareMutation(cpMutation, timestamp); // Acquire row locks. If not, the whole batch will fail. acquiredRowLocks.add(region.getRowLockInternal(cpMutation.getRow(), true, null)); // Returned mutations from coprocessor correspond to the Mutation at index i. We can // directly add the cells from those mutations to the familyMaps of this mutation. Map> cpFamilyMap = cpMutation.getFamilyCellMap(); region.rewriteCellTags(cpFamilyMap, mutation); // will get added to the memStore later mergeFamilyMaps(familyCellMaps[i], cpFamilyMap); // The durability of returned mutation is replaced by the corresponding mutation. // If the corresponding mutation contains the SKIP_WAL, we shouldn't count the // cells of returned mutation. if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { for (List cells : cpFamilyMap.values()) { miniBatchOp.addCellCount(cells.size()); } } } return true; }); } private void mergeFamilyMaps(Map> familyMap, Map> toBeMerged) { for (Map.Entry> entry : toBeMerged.entrySet()) { List cells = familyMap.get(entry.getKey()); if (cells == null) { familyMap.put(entry.getKey(), entry.getValue()); } else { cells.addAll(entry.getValue()); } } } } /** * Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most * of the logic is same. */ static class ReplayBatchOperation extends BatchOperation { private long origLogSeqNum = 0; public ReplayBatchOperation(final HRegion region, MutationReplay[] operations, long origLogSeqNum) { super(region, operations); this.origLogSeqNum = origLogSeqNum; } @Override public Mutation getMutation(int index) { return this.operations[index].mutation; } @Override public long getNonceGroup(int index) { return this.operations[index].nonceGroup; } @Override public long getNonce(int index) { return this.operations[index].nonce; } @Override public Mutation[] getMutationsForCoprocs() { return null; } @Override public boolean isInReplay() { return true; } @Override public long getOrigLogSeqNum() { return this.origLogSeqNum; } @Override public void startRegionOperation() throws IOException { region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE); } @Override public void closeRegionOperation() throws IOException { region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE); } /** * During replay, there could exist column families which are removed between region server * failure and replay */ @Override protected void checkAndPreparePut(Put p) throws IOException { Map> familyCellMap = p.getFamilyCellMap(); List nonExistentList = null; for (byte[] family : familyCellMap.keySet()) { if (!region.htableDescriptor.hasColumnFamily(family)) { if (nonExistentList == null) { nonExistentList = new ArrayList<>(); } nonExistentList.add(family); } } if (nonExistentList != null) { for (byte[] family : nonExistentList) { // Perhaps schema was changed between crash and replay LOG.info("No family for {} omit from reply in region {}.", Bytes.toString(family), this); familyCellMap.remove(family); } } } @Override public void checkAndPrepare() throws IOException { long now = EnvironmentEdgeManager.currentTime(); visitBatchOperations(true, this.size(), (int index) -> { checkAndPrepareMutation(index, now); return true; }); } @Override public void prepareMiniBatchOperations(MiniBatchOperationInProgress miniBatchOp, long timestamp, final List acquiredRowLocks) throws IOException { visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { // update cell count for (List cells : getMutation(index).getFamilyCellMap().values()) { miniBatchOp.addCellCount(cells.size()); } return true; }); } @Override public WriteEntry writeMiniBatchOperationsToMemStore( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum()); return writeEntry; } @Override public void completeMiniBatchOperations( final MiniBatchOperationInProgress miniBatchOp, final WriteEntry writeEntry) throws IOException { super.completeMiniBatchOperations(miniBatchOp, writeEntry); region.mvcc.advanceTo(getOrigLogSeqNum()); } } public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce) throws IOException { return batchMutate(mutations, false, nonceGroup, nonce); } public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, long nonce) throws IOException { // As it stands, this is used for 3 things // * batchMutate with single mutation - put/delete, separate or from checkAndMutate. // * coprocessor calls (see ex. BulkDeleteEndpoint). // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd... return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce)); } @Override public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException { return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE); } public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId) throws IOException { if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo()) && replaySeqId < lastReplayedOpenRegionSeqId) { // if it is a secondary replica we should ignore these entries silently // since they are coming out of order if (LOG.isTraceEnabled()) { LOG.trace(getRegionInfo().getEncodedName() + " : " + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); for (MutationReplay mut : mutations) { LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation); } } OperationStatus[] statuses = new OperationStatus[mutations.length]; for (int i = 0; i < statuses.length; i++) { statuses[i] = OperationStatus.SUCCESS; } return statuses; } return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId)); } /** * Perform a batch of mutations. * * It supports only Put and Delete mutations and will ignore other types passed. Operations in * a batch are stored with highest durability specified of for all operations in a batch, * except for {@link Durability#SKIP_WAL}. * *

This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with * {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[], long, long)} with * {@link MutationBatchOperation} instance as an argument. As the processing of replay batch * and mutation batch is very similar, lot of code is shared by providing generic methods in * base class {@link BatchOperation}. The logic for this method and * {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which * are overridden by derived classes to implement special behavior. * * @param batchOp contains the list of mutations * @return an array of OperationStatus which internally contains the * OperationStatusCode and the exceptionMessage if any. * @throws IOException if an IO problem is encountered */ OperationStatus[] batchMutate(BatchOperation batchOp) throws IOException { boolean initialized = false; batchOp.startRegionOperation(); try { while (!batchOp.isDone()) { if (!batchOp.isInReplay()) { checkReadOnly(); } checkResources(); if (!initialized) { this.writeRequestsCount.add(batchOp.size()); // validate and prepare batch for write, for MutationBatchOperation it also calls CP // prePut()/ preDelete() hooks batchOp.checkAndPrepare(); initialized = true; } doMiniBatchMutate(batchOp); requestFlushIfNeeded(); } } finally { if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor. getTableName(), batchOp.size()); } batchOp.closeRegionOperation(); } return batchOp.retCodeDetails; } /** * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[], long, long)} * In here we also handle replay of edits on region recover. Also gets change in size brought * about by applying {@code batchOp}. */ private void doMiniBatchMutate(BatchOperation batchOp) throws IOException { boolean success = false; WALEdit walEdit = null; WriteEntry writeEntry = null; boolean locked = false; // We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive) MiniBatchOperationInProgress miniBatchOp = null; /** Keep track of the locks we hold so we can release them in finally clause */ List acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size()); try { // STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with // locked rows miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks); // We've now grabbed as many mutations off the list as we can // Ensure we acquire at least one. if (miniBatchOp.getReadyToWriteCount() <= 0) { // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily? return; } lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount()); locked = true; // STEP 2. Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp // We should record the timestamp only after we have acquired the rowLock, // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp long now = EnvironmentEdgeManager.currentTime(); batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks); // STEP 3. Build WAL edit List> walEdits = batchOp.buildWALEdits(miniBatchOp); // STEP 4. Append the WALEdits to WAL and sync. for(Iterator> it = walEdits.iterator(); it.hasNext();) { Pair nonceKeyWALEditPair = it.next(); walEdit = nonceKeyWALEditPair.getSecond(); NonceKey nonceKey = nonceKeyWALEditPair.getFirst(); if (walEdit != null && !walEdit.isEmpty()) { writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now, nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum()); } // Complete mvcc for all but last writeEntry (for replay case) if (it.hasNext() && writeEntry != null) { mvcc.complete(writeEntry); writeEntry = null; } } // STEP 5. Write back to memStore // NOTE: writeEntry can be null here writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry); // STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and // complete mvcc for last writeEntry batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry); writeEntry = null; success = true; } finally { // Call complete rather than completeAndWait because we probably had error if walKey != null if (writeEntry != null) mvcc.complete(writeEntry); if (locked) { this.updatesLock.readLock().unlock(); } releaseRowLocks(acquiredRowLocks); final int finalLastIndexExclusive = miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size(); final boolean finalSuccess = success; batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> { batchOp.retCodeDetails[i] = finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE; return true; }); batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess); batchOp.nextIndexToProcess = finalLastIndexExclusive; } } /** * Returns effective durability from the passed durability and * the table descriptor. */ protected Durability getEffectiveDurability(Durability d) { return d == Durability.USE_DEFAULT ? this.regionDurability : d; } @Override public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException { return doCheckAndRowMutate(row, family, qualifier, op, comparator, null, timeRange, null, mutation); } @Override public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation) throws IOException { return doCheckAndRowMutate(row, null, null, null, null, filter, timeRange, null, mutation); } @Override public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException { return doCheckAndRowMutate(row, family, qualifier, op, comparator, null, timeRange, rm, null); } @Override public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm) throws IOException { return doCheckAndRowMutate(row, null, null, null, null, filter, timeRange, rm, null); } /** * checkAndMutate and checkAndRowMutate are 90% the same. Rather than copy/paste, below has * switches in the few places where there is deviation. */ private boolean doCheckAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, ByteArrayComparable comparator, Filter filter, TimeRange timeRange, RowMutations rowMutations, Mutation mutation) throws IOException { // Could do the below checks but seems wacky with two callers only. Just comment out for now. // One caller passes a Mutation, the other passes RowMutation. Presume all good so we don't // need these commented out checks. // if (rowMutations == null && mutation == null) throw new DoNotRetryIOException("Both null"); // if (rowMutations != null && mutation != null) throw new DoNotRetryIOException("Both set"); if (mutation != null) { checkMutationType(mutation); checkRow(mutation, row); } else { checkRow(rowMutations, row); } checkReadOnly(); // TODO, add check for value length also move this check to the client checkResources(); startRegionOperation(); try { Get get = new Get(row); if (family != null) { checkFamily(family); get.addColumn(family, qualifier); } if (filter != null) { get.setFilter(filter); } if (timeRange != null) { get.setTimeRange(timeRange.getMin(), timeRange.getMax()); } // Lock row - note that doBatchMutate will relock this row if called checkRow(row, "doCheckAndRowMutate"); RowLock rowLock = getRowLockInternal(get.getRow(), false, null); try { if (mutation != null && this.getCoprocessorHost() != null) { // Call coprocessor. Boolean processed = null; if (mutation instanceof Put) { if (filter != null) { processed = this.getCoprocessorHost() .preCheckAndPutAfterRowLock(row, filter, (Put) mutation); } else { processed = this.getCoprocessorHost() .preCheckAndPutAfterRowLock(row, family, qualifier, op, comparator, (Put) mutation); } } else if (mutation instanceof Delete) { if (filter != null) { processed = this.getCoprocessorHost() .preCheckAndDeleteAfterRowLock(row, filter, (Delete) mutation); } else { processed = this.getCoprocessorHost() .preCheckAndDeleteAfterRowLock(row, family, qualifier, op, comparator, (Delete) mutation); } } if (processed != null) { return processed; } } // NOTE: We used to wait here until mvcc caught up: mvcc.await(); // Supposition is that now all changes are done under row locks, then when we go to read, // we'll get the latest on this row. List result = get(get, false); boolean matches = false; long cellTs = 0; if (filter != null) { if (!result.isEmpty()) { matches = true; cellTs = result.get(0).getTimestamp(); } } else { boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0; if (result.isEmpty() && valueIsNull) { matches = true; } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && valueIsNull) { matches = true; cellTs = result.get(0).getTimestamp(); } else if (result.size() == 1 && !valueIsNull) { Cell kv = result.get(0); cellTs = kv.getTimestamp(); int compareResult = PrivateCellUtil.compareValue(kv, comparator); matches = matches(op, compareResult); } } // If matches put the new put or delete the new delete if (matches) { // We have acquired the row lock already. If the system clock is NOT monotonically // non-decreasing (see HBASE-14070) we should make sure that the mutation has a // larger timestamp than what was observed via Get. doBatchMutate already does this, but // there is no way to pass the cellTs. See HBASE-14054. long now = EnvironmentEdgeManager.currentTime(); long ts = Math.max(now, cellTs); // ensure write is not eclipsed byte[] byteTs = Bytes.toBytes(ts); if (mutation != null) { if (mutation instanceof Put) { updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs); } // And else 'delete' is not needed since it already does a second get, and sets the // timestamp from get (see prepareDeleteTimestamps). } else { for (Mutation m: rowMutations.getMutations()) { if (m instanceof Put) { updateCellTimestamps(m.getFamilyCellMap().values(), byteTs); } } // And else 'delete' is not needed since it already does a second get, and sets the // timestamp from get (see prepareDeleteTimestamps). } // All edits for the given row (across all column families) must happen atomically. if (mutation != null) { doBatchMutate(mutation); } else { mutateRow(rowMutations); } this.checkAndMutateChecksPassed.increment(); return true; } this.checkAndMutateChecksFailed.increment(); return false; } finally { rowLock.release(); } } finally { closeRegionOperation(); } } private void checkMutationType(final Mutation mutation) throws DoNotRetryIOException { boolean isPut = mutation instanceof Put; if (!isPut && !(mutation instanceof Delete)) { throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must be Put or Delete"); } } private void checkRow(final Row action, final byte[] row) throws DoNotRetryIOException { if (!Bytes.equals(row, action.getRow())) { throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match"); } } private boolean matches(final CompareOperator op, final int compareResult) { boolean matches = false; switch (op) { case LESS: matches = compareResult < 0; break; case LESS_OR_EQUAL: matches = compareResult <= 0; break; case EQUAL: matches = compareResult == 0; break; case NOT_EQUAL: matches = compareResult != 0; break; case GREATER_OR_EQUAL: matches = compareResult >= 0; break; case GREATER: matches = compareResult > 0; break; default: throw new RuntimeException("Unknown Compare op " + op.name()); } return matches; } private void doBatchMutate(Mutation mutation) throws IOException { // Currently this is only called for puts and deletes, so no nonces. OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation}); if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg()); } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg()); } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) { throw new RegionTooBusyException(batchMutate[0].getExceptionMsg()); } } /** * Complete taking the snapshot on the region. Writes the region info and adds references to the * working snapshot directory. * * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare} * arg. (In the future other cancellable HRegion methods could eventually add a * {@link ForeignExceptionSnare}, or we could do something fancier). * * @param desc snapshot description object * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to * bail out. This is allowed to be null and will just be ignored in that case. * @throws IOException if there is an external or internal error causing the snapshot to fail */ public void addRegionToSnapshot(SnapshotDescription desc, ForeignExceptionSnare exnSnare) throws IOException { Path rootDir = CommonFSUtils.getRootDir(conf); Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf); SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(), snapshotDir, desc, exnSnare); manifest.addRegion(this); } private void updateSequenceId(final Iterable> cellItr, final long sequenceId) throws IOException { for (List cells: cellItr) { if (cells == null) return; for (Cell cell : cells) { PrivateCellUtil.setSequenceId(cell, sequenceId); } } } /** * Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP} * provided current timestamp. * @param cellItr * @param now */ private static void updateCellTimestamps(final Iterable> cellItr, final byte[] now) throws IOException { for (List cells: cellItr) { if (cells == null) continue; // Optimization: 'foreach' loop is not used. See: // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i = 0; i < listSize; i++) { PrivateCellUtil.updateLatestStamp(cells.get(i), now); } } } /** * Possibly rewrite incoming cell tags. */ void rewriteCellTags(Map> familyMap, final Mutation m) { // Check if we have any work to do and early out otherwise // Update these checks as more logic is added here if (m.getTTL() == Long.MAX_VALUE) { return; } // From this point we know we have some work to do for (Map.Entry> e: familyMap.entrySet()) { List cells = e.getValue(); assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i = 0; i < listSize; i++) { Cell cell = cells.get(i); List newTags = TagUtil.carryForwardTags(null, cell); newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL()); // Rewrite the cell with the updated set of tags cells.set(i, PrivateCellUtil.createCell(cell, newTags)); } } } /* * Check if resources to support an update. * * We throw RegionTooBusyException if above memstore limit * and expect client to retry using some kind of backoff */ void checkResources() throws RegionTooBusyException { // If catalog region, do not impose resource constraints or block updates. if (this.getRegionInfo().isMetaRegion()) return; MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) { blockedRequestsCount.increment(); requestFlush(); // Don't print current limit because it will vary too much. The message is used as a key // over in RetriesExhaustedWithDetailsException processing. final String regionName = this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName(); final String serverName = this.getRegionServerServices() == null ? "unknown" : (this.getRegionServerServices().getServerName() == null ? "unknown" : this.getRegionServerServices().getServerName().toString()); RegionTooBusyException rtbe = new RegionTooBusyException( "Over memstore limit=" + org.apache.hadoop.hbase.procedure2.util.StringUtils .humanSize(this.blockingMemStoreSize) + ", regionName=" + regionName + ", server=" + serverName); LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe); throw rtbe; } } /** * @throws IOException Throws exception if region is in read-only mode. */ protected void checkReadOnly() throws IOException { if (isReadOnly()) { throw new DoNotRetryIOException("region is read only"); } } protected void checkReadsEnabled() throws IOException { if (!this.writestate.readsEnabled) { throw new IOException(getRegionInfo().getEncodedName() + ": The region's reads are disabled. Cannot serve the request"); } } public void setReadsEnabled(boolean readsEnabled) { if (readsEnabled && !this.writestate.readsEnabled) { LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region."); } this.writestate.setReadsEnabled(readsEnabled); } /** * Add updates first to the wal and then add values to memstore. * Warning: Assumption is caller has lock on passed in row. * @param edits Cell updates by column * @throws IOException */ void put(final byte [] row, byte [] family, List edits) throws IOException { NavigableMap> familyMap; familyMap = new TreeMap<>(Bytes.BYTES_COMPARATOR); familyMap.put(family, edits); Put p = new Put(row); p.setFamilyCellMap(familyMap); doBatchMutate(p); } /** * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be * set; when set we will run operations that make sense in the increment/append scenario * but that do not make sense otherwise. * @see #applyToMemStore(HStore, Cell, MemStoreSizing) */ private void applyToMemStore(HStore store, List cells, boolean delta, MemStoreSizing memstoreAccounting) throws IOException { // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1; if (upsert) { store.upsert(cells, getSmallestReadPoint(), memstoreAccounting); } else { store.add(cells, memstoreAccounting); } } /** * @see #applyToMemStore(HStore, List, boolean, MemStoreSizing) */ private void applyToMemStore(HStore store, Cell cell, MemStoreSizing memstoreAccounting) throws IOException { // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! if (store == null) { checkFamily(CellUtil.cloneFamily(cell)); // Unreachable because checkFamily will throw exception } store.add(cell, memstoreAccounting); } /** * Check the collection of families for validity. * @param families * @throws NoSuchColumnFamilyException */ public void checkFamilies(Collection families) throws NoSuchColumnFamilyException { for (byte[] family : families) { checkFamily(family); } } /** * Check the collection of families for valid timestamps * @param familyMap * @param now current timestamp * @throws FailedSanityCheckException */ public void checkTimestamps(final Map> familyMap, long now) throws FailedSanityCheckException { if (timestampSlop == HConstants.LATEST_TIMESTAMP) { return; } long maxTs = now + timestampSlop; for (List kvs : familyMap.values()) { // Optimization: 'foreach' loop is not used. See: // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects assert kvs instanceof RandomAccess; int listSize = kvs.size(); for (int i=0; i < listSize; i++) { Cell cell = kvs.get(i); // see if the user-side TS is out of range. latest = server-side long ts = cell.getTimestamp(); if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) { throw new FailedSanityCheckException("Timestamp for KV out of range " + cell + " (too.new=" + timestampSlop + ")"); } } } } /* * @param size * @return True if size is over the flush threshold */ private boolean isFlushSize(MemStoreSize size) { return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize(); } private void deleteRecoveredEdits(FileSystem fs, Iterable files) throws IOException { for (Path file : files) { if (!fs.delete(file, false)) { LOG.error("Failed delete of {}", file); } else { LOG.debug("Deleted recovered.edits file={}", file); } } } /** * Read the edits put under this region by wal splitting process. Put * the recovered edits back up into this region. * *

We can ignore any wal message that has a sequence ID that's equal to or * lower than minSeqId. (Because we know such messages are already * reflected in the HFiles.) * *

While this is running we are putting pressure on memory yet we are * outside of our usual accounting because we are not yet an onlined region * (this stuff is being run as part of Region initialization). This means * that if we're up against global memory limits, we'll not be flagged to flush * because we are not online. We can't be flushed by usual mechanisms anyways; * we're not yet online so our relative sequenceids are not yet aligned with * WAL sequenceids -- not till we come up online, post processing of split * edits. * *

But to help relieve memory pressure, at least manage our own heap size * flushing if are in excess of per-region limits. Flushing, though, we have * to be careful and avoid using the regionserver/wal sequenceid. Its running * on a different line to whats going on in here in this region context so if we * crashed replaying these edits, but in the midst had a flush that used the * regionserver wal with a sequenceid in excess of whats going on in here * in this region and with its split editlogs, then we could miss edits the * next time we go to recover. So, we have to flush inline, using seqids that * make sense in a this single region context only -- until we online. * * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of * the maxSeqId for the store to be applied, else its skipped. * @return the sequence id of the last edit added to this region out of the * recovered edits log or minSeqId if nothing added from editlogs. */ @VisibleForTesting long replayRecoveredEditsIfAny(Map maxSeqIdInStores, final CancelableProgressable reporter, final MonitoredTask status) throws IOException { long minSeqIdForTheRegion = -1; for (Long maxSeqIdInStore : maxSeqIdInStores.values()) { if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) { minSeqIdForTheRegion = maxSeqIdInStore; } } long seqId = minSeqIdForTheRegion; String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR); if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) { FileSystem walFS = getWalFileSystem(); FileSystem rootFS = getFilesystem(); Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), getRegionInfo().getEncodedName()); Path regionWALDir = getWALRegionDir(); Path regionDir = FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo()); // We made a mistake in HBASE-20734 so we need to do this dirty hack... NavigableSet filesUnderWrongRegionWALDir = WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir); seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, filesUnderWrongRegionWALDir, reporter, regionDir)); // This is to ensure backwards compatability with HBASE-20723 where recovered edits can appear // under the root dir even if walDir is set. NavigableSet filesUnderRootDir = Collections.emptyNavigableSet(); if (!regionWALDir.equals(regionDir)) { filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir); seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS, filesUnderRootDir, reporter, regionDir)); } NavigableSet files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir); seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir)); if (seqId > minSeqIdForTheRegion) { // Then we added some edits to memory. Flush and cleanup split edit files. internalFlushcache(null, seqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } // Now delete the content of recovered edits. We're done w/ them. if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) { // For debugging data loss issues! // If this flag is set, make use of the hfile archiving by making recovered.edits a fake // column family. Have to fake out file type too by casting our recovered.edits as // storefiles String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName(); Set fakeStoreFiles = new HashSet<>(files.size()); for (Path file : files) { fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true)); } getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles); } else { deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir)); deleteRecoveredEdits(rootFS, filesUnderRootDir); } } else { Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr); FileSystem fs = recoveredEditsDir.getFileSystem(conf); FileStatus[] files = fs.listStatus(recoveredEditsDir); LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length, recoveredEditsDir); if (files != null) { for (FileStatus file : files) { seqId = Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs)); } } if (seqId > minSeqIdForTheRegion) { // Then we added some edits to memory. Flush and cleanup split edit files. internalFlushcache(null, seqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } deleteRecoveredEdits(fs, Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList())); } return seqId; } private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs, final NavigableSet files, final CancelableProgressable reporter, final Path regionDir) throws IOException { long seqid = minSeqIdForTheRegion; if (LOG.isDebugEnabled()) { LOG.debug("Found " + (files == null ? 0 : files.size()) + " recovered edits file(s) under " + regionDir); } if (files == null || files.isEmpty()) { return minSeqIdForTheRegion; } for (Path edits: files) { if (edits == null || !fs.exists(edits)) { LOG.warn("Null or non-existent edits file: " + edits); continue; } if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) { continue; } long maxSeqId; String fileName = edits.getName(); maxSeqId = Math.abs(Long.parseLong(fileName)); if (maxSeqId <= minSeqIdForTheRegion) { if (LOG.isDebugEnabled()) { String msg = "Maximum sequenceid for this wal is " + maxSeqId + " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion + ", skipped the whole file, path=" + edits; LOG.debug(msg); } continue; } try { // replay the edits. Replay can return -1 if everything is skipped, only update // if seqId is greater seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs)); } catch (IOException e) { handleException(fs, edits, e); } } return seqid; } private void handleException(FileSystem fs, Path edits, IOException e) throws IOException { boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS)); if (conf.get("hbase.skip.errors") != null) { LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use " + HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead."); } if (skipErrors) { Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits); LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed " + edits + " as " + p, e); } else { throw e; } } /** * @param edits File of recovered edits. * @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger * than this to be replayed for each store. * @return the sequence id of the last edit added to this region out of the recovered edits log or * minSeqId if nothing added from editlogs. */ private long replayRecoveredEdits(final Path edits, Map maxSeqIdInStores, final CancelableProgressable reporter, FileSystem fs) throws IOException { String msg = "Replaying edits from " + edits; LOG.info(msg); MonitoredTask status = TaskMonitor.get().createStatus(msg); status.setStatus("Opening recovered edits"); WAL.Reader reader = null; try { reader = WALFactory.createReader(fs, edits, conf); long currentEditSeqId = -1; long currentReplaySeqId = -1; long firstSeqIdInLog = -1; long skippedEdits = 0; long editsCount = 0; long intervalEdits = 0; WAL.Entry entry; HStore store = null; boolean reported_once = false; ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager(); try { // How many edits seen before we check elapsed time int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); // How often to send a progress report (default 1/2 master timeout) int period = this.conf.getInt("hbase.hstore.report.period", 300000); long lastReport = EnvironmentEdgeManager.currentTime(); if (coprocessorHost != null) { coprocessorHost.preReplayWALs(this.getRegionInfo(), edits); } while ((entry = reader.next()) != null) { WALKey key = entry.getKey(); WALEdit val = entry.getEdit(); if (ng != null) { // some test, or nonces disabled ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime()); } if (reporter != null) { intervalEdits += val.size(); if (intervalEdits >= interval) { // Number of edits interval reached intervalEdits = 0; long cur = EnvironmentEdgeManager.currentTime(); if (lastReport + period <= cur) { status.setStatus("Replaying edits..." + " skipped=" + skippedEdits + " edits=" + editsCount); // Timeout reached if(!reporter.progress()) { msg = "Progressable reporter failed, stopping replay for region " + this; LOG.warn(msg); status.abort(msg); throw new IOException(msg); } reported_once = true; lastReport = cur; } } } if (firstSeqIdInLog == -1) { firstSeqIdInLog = key.getSequenceId(); } if (currentEditSeqId > key.getSequenceId()) { // when this condition is true, it means we have a serious defect because we need to // maintain increasing SeqId for WAL edits per region LOG.error(getRegionInfo().getEncodedName() + " : " + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key + "; edit=" + val); } else { currentEditSeqId = key.getSequenceId(); } currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ? key.getOrigLogSeqNum() : currentEditSeqId; // Start coprocessor replay here. The coprocessor is for each WALEdit // instead of a KeyValue. if (coprocessorHost != null) { status.setStatus("Running pre-WAL-restore hook in coprocessors"); if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) { // if bypass this wal entry, ignore it ... continue; } } boolean checkRowWithinBoundary = false; // Check this edit is for this region. if (!Bytes.equals(key.getEncodedRegionName(), this.getRegionInfo().getEncodedNameAsBytes())) { checkRowWithinBoundary = true; } boolean flush = false; MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing(); for (Cell cell: val.getCells()) { // Check this edit is for me. Also, guard against writing the special // METACOLUMN info such as HBASE::CACHEFLUSH entries if (WALEdit.isMetaEditFamily(cell)) { // if region names don't match, skipp replaying compaction marker if (!checkRowWithinBoundary) { //this is a special edit, we should handle it CompactionDescriptor compaction = WALEdit.getCompaction(cell); if (compaction != null) { //replay the compaction replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE); } } skippedEdits++; continue; } // Figure which store the edit is meant for. if (store == null || !CellUtil.matchingFamily(cell, store.getColumnFamilyDescriptor().getName())) { store = getStore(cell); } if (store == null) { // This should never happen. Perhaps schema was changed between // crash and redeploy? LOG.warn("No family for cell {} in region {}", cell, this); skippedEdits++; continue; } if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(), cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) { LOG.warn("Row of {} is not within region boundary for region {}", cell, this); skippedEdits++; continue; } // Now, figure if we should skip this edit. if (key.getSequenceId() <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor() .getName())) { skippedEdits++; continue; } PrivateCellUtil.setSequenceId(cell, currentReplaySeqId); restoreEdit(store, cell, memStoreSizing); editsCount++; } MemStoreSize mss = memStoreSizing.getMemStoreSize(); incMemStoreSize(mss); flush = isFlushSize(this.memStoreSizing.getMemStoreSize()); if (flush) { internalFlushcache(null, currentEditSeqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY); } if (coprocessorHost != null) { coprocessorHost.postWALRestore(this.getRegionInfo(), key, val); } } if (coprocessorHost != null) { coprocessorHost.postReplayWALs(this.getRegionInfo(), edits); } } catch (EOFException eof) { Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); msg = "EnLongAddered EOF. Most likely due to Master failure during " + "wal splitting, so we have this data in another edit. Continuing, but renaming " + edits + " as " + p + " for region " + this; LOG.warn(msg, eof); status.abort(msg); } catch (IOException ioe) { // If the IOE resulted from bad file format, // then this problem is idempotent and retrying won't help if (ioe.getCause() instanceof ParseException) { Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); msg = "File corruption enLongAddered! " + "Continuing, but renaming " + edits + " as " + p; LOG.warn(msg, ioe); status.setStatus(msg); } else { status.abort(StringUtils.stringifyException(ioe)); // other IO errors may be transient (bad network connection, // checksum exception on one datanode, etc). throw & retry throw ioe; } } if (reporter != null && !reported_once) { reporter.progress(); } msg = "Applied " + editsCount + ", skipped " + skippedEdits + ", firstSequenceIdInLog=" + firstSeqIdInLog + ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits; status.markComplete(msg); LOG.debug(msg); return currentEditSeqId; } finally { status.cleanup(); if (reader != null) { reader.close(); } } } /** * Call to complete a compaction. Its for the case where we find in the WAL a compaction * that was not finished. We could find one recovering a WAL after a regionserver crash. * See HBASE-2331. */ void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles, boolean removeFiles, long replaySeqId) throws IOException { try { checkTargetRegion(compaction.getEncodedRegionName().toByteArray(), "Compaction marker from WAL ", compaction); } catch (WrongRegionException wre) { if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { // skip the compaction marker since it is not for this region return; } throw wre; } synchronized (writestate) { if (replaySeqId < lastReplayedOpenRegionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId); return; } if (replaySeqId < lastReplayedCompactionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId + " is smaller than this regions " + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId); return; } else { lastReplayedCompactionSeqId = replaySeqId; } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying compaction marker " + TextFormat.shortDebugString(compaction) + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); } startRegionOperation(Operation.REPLAY_EVENT); try { HStore store = this.getStore(compaction.getFamilyName().toByteArray()); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Found Compaction WAL edit for deleted family:" + Bytes.toString(compaction.getFamilyName().toByteArray())); return; } store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles); logRegionFiles(); } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files in compaction: " + TextFormat.shortDebugString(compaction) + " doesn't exist any more. Skip loading the file(s)", ex); } finally { closeRegionOperation(Operation.REPLAY_EVENT); } } } void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException { checkTargetRegion(flush.getEncodedRegionName().toByteArray(), "Flush marker from WAL ", flush); if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return; // if primary nothing to do } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying flush marker " + TextFormat.shortDebugString(flush)); } startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close try { FlushAction action = flush.getAction(); switch (action) { case START_FLUSH: replayWALFlushStartMarker(flush); break; case COMMIT_FLUSH: replayWALFlushCommitMarker(flush); break; case ABORT_FLUSH: replayWALFlushAbortMarker(flush); break; case CANNOT_FLUSH: replayWALFlushCannotFlushMarker(flush, replaySeqId); break; default: LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush event with unknown action, ignoring. " + TextFormat.shortDebugString(flush)); break; } logRegionFiles(); } finally { closeRegionOperation(Operation.REPLAY_EVENT); } } /** Replay the flush marker from primary region by creating a corresponding snapshot of * the store memstores, only if the memstores do not have a higher seqId from an earlier wal * edit (because the events may be coming out of order). */ @VisibleForTesting PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException { long flushSeqId = flush.getFlushSequenceNumber(); HashSet storesToFlush = new HashSet<>(); for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { byte[] family = storeFlush.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush start marker from primary, but the family is not found. Ignoring" + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush)); continue; } storesToFlush.add(store); } MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this); // we will use writestate as a coarse-grain lock for all the replay events // (flush, compaction, region open etc) synchronized (writestate) { try { if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush) + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " + " of " + lastReplayedOpenRegionSeqId); return null; } if (numMutationsWithoutWAL.sum() > 0) { numMutationsWithoutWAL.reset(); dataInMemoryWithoutWAL.reset(); } if (!writestate.flushing) { // we do not have an active snapshot and corresponding this.prepareResult. This means // we can just snapshot our memstores and continue as normal. // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId, storesToFlush, status, false, FlushLifeCycleTracker.DUMMY); if (prepareResult.result == null) { // save the PrepareFlushResult so that we can use it later from commit flush this.writestate.flushing = true; this.prepareFlushResult = prepareResult; status.markComplete("Flush prepare successful"); if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared flush with seqId:" + flush.getFlushSequenceNumber()); } } else { // special case empty memstore. We will still save the flush result in this case, since // our memstore ie empty, but the primary is still flushing if (prepareResult.getResult().getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) { this.writestate.flushing = true; this.prepareFlushResult = prepareResult; if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber()); } } status.abort("Flush prepare failed with " + prepareResult.result); // nothing much to do. prepare flush failed because of some reason. } return prepareResult; } else { // we already have an active snapshot. if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) { // They define the same flush. Log and continue. LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush prepare marker with the same seqId: " + + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " + prepareFlushResult.flushOpSeqId + ". Ignoring"); // ignore } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) { // We received a flush with a smaller seqNum than what we have prepared. We can only // ignore this prepare flush request. LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush prepare marker with a smaller seqId: " + + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " + prepareFlushResult.flushOpSeqId + ". Ignoring"); // ignore } else { // We received a flush with a larger seqNum than what we have prepared LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush prepare marker with a larger seqId: " + + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " + prepareFlushResult.flushOpSeqId + ". Ignoring"); // We do not have multiple active snapshots in the memstore or a way to merge current // memstore snapshot with the contents and resnapshot for now. We cannot take // another snapshot and drop the previous one because that will cause temporary // data loss in the secondary. So we ignore this for now, deferring the resolution // to happen when we see the corresponding flush commit marker. If we have a memstore // snapshot with x, and later received another prepare snapshot with y (where x < y), // when we see flush commit for y, we will drop snapshot for x, and can also drop all // the memstore edits if everything in memstore is < y. This is the usual case for // RS crash + recovery where we might see consequtive prepare flush wal markers. // Otherwise, this will cause more memory to be used in secondary replica until a // further prapare + commit flush is seen and replayed. } } } finally { status.cleanup(); writestate.notifyAll(); } } return null; } @VisibleForTesting @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY", justification="Intentional; post memstore flush") void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException { MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this); // check whether we have the memstore snapshot with the corresponding seqId. Replay to // secondary region replicas are in order, except for when the region moves or then the // region server crashes. In those cases, we may receive replay requests out of order from // the original seqIds. synchronized (writestate) { try { if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush) + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " + " of " + lastReplayedOpenRegionSeqId); return; } if (writestate.flushing) { PrepareFlushResult prepareFlushResult = this.prepareFlushResult; if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) { if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() + " and a previous prepared snapshot was found"); } // This is the regular case where we received commit flush after prepare flush // corresponding to the same seqId. replayFlushInStores(flush, prepareFlushResult, true); // Set down the memstore size by amount of flush. this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); this.prepareFlushResult = null; writestate.flushing = false; } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) { // This should not happen normally. However, lets be safe and guard against these cases // we received a flush commit with a smaller seqId than what we have prepared // we will pick the flush file up from this commit (if we have not seen it), but we // will not drop the memstore LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with smaller seqId: " + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping" +" prepared memstore snapshot"); replayFlushInStores(flush, prepareFlushResult, false); // snapshot is not dropped, so memstore sizes should not be decremented // we still have the prepared snapshot, flushing should still be true } else { // This should not happen normally. However, lets be safe and guard against these cases // we received a flush commit with a larger seqId than what we have prepared // we will pick the flush file for this. We will also obtain the updates lock and // look for contents of the memstore to see whether we have edits after this seqId. // If not, we will drop all the memstore edits and the snapshot as well. LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with larger seqId: " + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " + prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared" +" memstore snapshot"); replayFlushInStores(flush, prepareFlushResult, true); // Set down the memstore size by amount of flush. this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); // Inspect the memstore contents to see whether the memstore contains only edits // with seqId smaller than the flush seqId. If so, we can discard those edits. dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); this.prepareFlushResult = null; writestate.flushing = false; } // If we were waiting for observing a flush or region opening event for not showing // partial data after a secondary region crash, we can allow reads now. We can only make // sure that we are not showing partial data (for example skipping some previous edits) // until we observe a full flush start and flush commit. So if we were not able to find // a previous flush we will not enable reads now. this.setReadsEnabled(true); } else { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() + ", but no previous prepared snapshot was found"); // There is no corresponding prepare snapshot from before. // We will pick up the new flushed file replayFlushInStores(flush, null, false); // Inspect the memstore contents to see whether the memstore contains only edits // with seqId smaller than the flush seqId. If so, we can discard those edits. dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); } status.markComplete("Flush commit successful"); // Update the last flushed sequence id for region. this.maxFlushedSeqId = flush.getFlushSequenceNumber(); // advance the mvcc read point so that the new flushed file is visible. mvcc.advanceTo(flush.getFlushSequenceNumber()); } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush) + " doesn't exist any more. Skip loading the file(s)", ex); } finally { status.cleanup(); writestate.notifyAll(); } } // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } } /** * Replays the given flush descriptor by opening the flush files in stores and dropping the * memstore snapshots if requested. * @param flush * @param prepareFlushResult * @param dropMemstoreSnapshot * @throws IOException */ private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult, boolean dropMemstoreSnapshot) throws IOException { for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { byte[] family = storeFlush.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker from primary, but the family is not found." + "Ignoring StoreFlushDescriptor:" + storeFlush); continue; } List flushFiles = storeFlush.getFlushOutputList(); StoreFlushContext ctx = null; long startTime = EnvironmentEdgeManager.currentTime(); if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) { ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY); } else { ctx = prepareFlushResult.storeFlushCtxs.get(family); startTime = prepareFlushResult.startTime; } if (ctx == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Unexpected: flush commit marker received from store " + Bytes.toString(family) + " but no associated flush context. Ignoring"); continue; } ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush // Record latest flush time this.lastStoreFlushTimeMap.put(store, startTime); } } private long loadRecoveredHFilesIfAny(Collection stores) throws IOException { Path regionDir = fs.getRegionDir(); long maxSeqId = -1; for (HStore store : stores) { String familyName = store.getColumnFamilyName(); FileStatus[] files = WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName); if (files != null && files.length != 0) { for (FileStatus file : files) { Path filePath = file.getPath(); // If file length is zero then delete it if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) { continue; } try { HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath()); maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID()); } catch (IOException e) { handleException(fs.getFileSystem(), filePath, e); continue; } } if (this.rsServices != null && store.needsCompaction()) { this.rsServices.getCompactionRequestor() .requestCompaction(this, store, "load recovered hfiles request compaction", Store.PRIORITY_USER + 1, CompactionLifeCycleTracker.DUMMY, null); } } } return maxSeqId; } /** * Be careful, this method will drop all data in the memstore of this region. * Currently, this method is used to drop memstore to prevent memory leak * when replaying recovered.edits while opening region. */ public MemStoreSize dropMemStoreContents() throws IOException { MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); this.updatesLock.writeLock().lock(); try { for (HStore s : stores.values()) { MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM); LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region " + this.getRegionInfo().getRegionNameAsString() + " , dropped memstoresize: [" + memStoreSize + " }"); totalFreedSize.incMemStoreSize(memStoreSize); } return totalFreedSize.getMemStoreSize(); } finally { this.updatesLock.writeLock().unlock(); } } /** * Drops the memstore contents after replaying a flush descriptor or region open event replay * if the memstore edits have seqNums smaller than the given seq id * @throws IOException */ private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException { MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); this.updatesLock.writeLock().lock(); try { long currentSeqId = mvcc.getReadPoint(); if (seqId >= currentSeqId) { // then we can drop the memstore contents since everything is below this seqId LOG.info(getRegionInfo().getEncodedName() + " : " + "Dropping memstore contents as well since replayed flush seqId: " + seqId + " is greater than current seqId:" + currentSeqId); // Prepare flush (take a snapshot) and then abort (drop the snapshot) if (store == null) { for (HStore s : stores.values()) { totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId)); } } else { totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId)); } } else { LOG.info(getRegionInfo().getEncodedName() + " : " + "Not dropping memstore contents since replayed flush seqId: " + seqId + " is smaller than current seqId:" + currentSeqId); } } finally { this.updatesLock.writeLock().unlock(); } return totalFreedSize.getMemStoreSize(); } private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId) throws IOException { MemStoreSize flushableSize = s.getFlushableSize(); this.decrMemStoreSize(flushableSize); StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY); ctx.prepare(); ctx.abort(); return flushableSize; } private void replayWALFlushAbortMarker(FlushDescriptor flush) { // nothing to do for now. A flush abort will cause a RS abort which means that the region // will be opened somewhere else later. We will see the region open event soon, and replaying // that will drop the snapshot } private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) { synchronized (writestate) { if (this.lastReplayedOpenRegionSeqId > replaySeqId) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush) + " because its sequence id " + replaySeqId + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId); return; } // If we were waiting for observing a flush or region opening event for not showing partial // data after a secondary region crash, we can allow reads now. This event means that the // primary was not able to flush because memstore is empty when we requested flush. By the // time we observe this, we are guaranteed to have up to date seqId with our previous // assignment. this.setReadsEnabled(true); } } @VisibleForTesting PrepareFlushResult getPrepareFlushResult() { return prepareFlushResult; } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY", justification="Intentional; cleared the memstore") void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException { checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(), "RegionEvent marker from WAL ", regionEvent); startRegionOperation(Operation.REPLAY_EVENT); try { if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return; // if primary nothing to do } if (regionEvent.getEventType() == EventType.REGION_CLOSE) { // nothing to do on REGION_CLOSE for now. return; } if (regionEvent.getEventType() != EventType.REGION_OPEN) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Unknown region event received, ignoring :" + TextFormat.shortDebugString(regionEvent)); return; } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent)); } // we will use writestate as a coarse-grain lock for all the replay events synchronized (writestate) { // Replication can deliver events out of order when primary region moves or the region // server crashes, since there is no coordination between replication of different wal files // belonging to different region servers. We have to safe guard against this case by using // region open event's seqid. Since this is the first event that the region puts (after // possibly flushing recovered.edits), after seeing this event, we can ignore every edit // smaller than this seqId if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) { this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber(); } else { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent) + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " + " of " + lastReplayedOpenRegionSeqId); return; } // region open lists all the files that the region has at the time of the opening. Just pick // all the files and drop prepared flushes and empty memstores for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) { // stores of primary may be different now byte[] family = storeDescriptor.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a region open marker from primary, but the family is not found. " + "Ignoring. StoreDescriptor:" + storeDescriptor); continue; } long storeSeqId = store.getMaxSequenceId().orElse(0L); List storeFiles = storeDescriptor.getStoreFileList(); try { store.refreshStoreFiles(storeFiles); // replace the files with the new ones } catch (FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files: " + storeFiles + " doesn't exist any more. Skip loading the file(s)", ex); continue; } if (store.getMaxSequenceId().orElse(0L) != storeSeqId) { // Record latest flush time if we picked up new files lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime()); } if (writestate.flushing) { // only drop memstore snapshots if they are smaller than last flush for the store if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) { StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ? null : this.prepareFlushResult.storeFlushCtxs.get(family); if (ctx != null) { MemStoreSize mss = store.getFlushableSize(); ctx.abort(); this.decrMemStoreSize(mss); this.prepareFlushResult.storeFlushCtxs.remove(family); } } } // Drop the memstore contents if they are now smaller than the latest seen flushed file dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store); if (storeSeqId > this.maxFlushedSeqId) { this.maxFlushedSeqId = storeSeqId; } } // if all stores ended up dropping their snapshots, we can safely drop the // prepareFlushResult dropPrepareFlushIfPossible(); // advance the mvcc read point so that the new flushed file is visible. mvcc.await(); // If we were waiting for observing a flush or region opening event for not showing partial // data after a secondary region crash, we can allow reads now. this.setReadsEnabled(true); // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } } logRegionFiles(); } finally { closeRegionOperation(Operation.REPLAY_EVENT); } } void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException { checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(), "BulkLoad marker from WAL ", bulkLoadEvent); if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return; // if primary nothing to do } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent)); } // check if multiple families involved boolean multipleFamilies = false; byte[] family = null; for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { byte[] fam = storeDescriptor.getFamilyName().toByteArray(); if (family == null) { family = fam; } else if (!Bytes.equals(family, fam)) { multipleFamilies = true; break; } } startBulkRegionOperation(multipleFamilies); try { // we will use writestate as a coarse-grain lock for all the replay events synchronized (writestate) { // Replication can deliver events out of order when primary region moves or the region // server crashes, since there is no coordination between replication of different wal files // belonging to different region servers. We have to safe guard against this case by using // region open event's seqid. Since this is the first event that the region puts (after // possibly flushing recovered.edits), after seeing this event, we can ignore every edit // smaller than this seqId if (bulkLoadEvent.getBulkloadSeqNum() >= 0 && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying bulkload event :" + TextFormat.shortDebugString(bulkLoadEvent) + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId" + " =" + lastReplayedOpenRegionSeqId); return; } for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { // stores of primary may be different now family = storeDescriptor.getFamilyName().toByteArray(); HStore store = getStore(family); if (store == null) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "Received a bulk load marker from primary, but the family is not found. " + "Ignoring. StoreDescriptor:" + storeDescriptor); continue; } List storeFiles = storeDescriptor.getStoreFileList(); for (String storeFile : storeFiles) { StoreFileInfo storeFileInfo = null; try { storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile); store.bulkLoadHFile(storeFileInfo); } catch(FileNotFoundException ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + ((storeFileInfo != null) ? storeFileInfo.toString() : (new Path(Bytes.toString(family), storeFile)).toString()) + " doesn't exist any more. Skip loading the file"); } } } } if (bulkLoadEvent.getBulkloadSeqNum() > 0) { mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum()); } } finally { closeBulkRegionOperation(); } } /** * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult */ private void dropPrepareFlushIfPossible() { if (writestate.flushing) { boolean canDrop = true; if (prepareFlushResult.storeFlushCtxs != null) { for (Entry entry : prepareFlushResult.storeFlushCtxs .entrySet()) { HStore store = getStore(entry.getKey()); if (store == null) { continue; } if (store.getSnapshotSize().getDataSize() > 0) { canDrop = false; break; } } } // this means that all the stores in the region has finished flushing, but the WAL marker // may not have been written or we did not receive it yet. if (canDrop) { writestate.flushing = false; this.prepareFlushResult = null; } } } @Override public boolean refreshStoreFiles() throws IOException { return refreshStoreFiles(false); } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", justification = "Notify is about post replay. Intentional") protected boolean refreshStoreFiles(boolean force) throws IOException { if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { return false; // if primary nothing to do } if (LOG.isDebugEnabled()) { LOG.debug(getRegionInfo().getEncodedName() + " : " + "Refreshing store files to see whether we can free up memstore"); } long totalFreedDataSize = 0; long smallestSeqIdInStores = Long.MAX_VALUE; startRegionOperation(); // obtain region close lock try { Map map = new HashMap<>(); synchronized (writestate) { for (HStore store : stores.values()) { // TODO: some stores might see new data from flush, while others do not which // MIGHT break atomic edits across column families. long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L); // refresh the store files. This is similar to observing a region open wal marker. store.refreshStoreFiles(); long storeSeqId = store.getMaxSequenceId().orElse(0L); if (storeSeqId < smallestSeqIdInStores) { smallestSeqIdInStores = storeSeqId; } // see whether we can drop the memstore or the snapshot if (storeSeqId > maxSeqIdBefore) { if (writestate.flushing) { // only drop memstore snapshots if they are smaller than last flush for the store if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) { StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ? null : this.prepareFlushResult.storeFlushCtxs.get( store.getColumnFamilyDescriptor().getName()); if (ctx != null) { MemStoreSize mss = store.getFlushableSize(); ctx.abort(); this.decrMemStoreSize(mss); this.prepareFlushResult.storeFlushCtxs. remove(store.getColumnFamilyDescriptor().getName()); totalFreedDataSize += mss.getDataSize(); } } } map.put(store, storeSeqId); } } // if all stores ended up dropping their snapshots, we can safely drop the // prepareFlushResult dropPrepareFlushIfPossible(); // advance the mvcc read point so that the new flushed files are visible. // either greater than flush seq number or they were already picked up via flush. for (HStore s : stores.values()) { mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L)); } // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely // skip all edits that are to be replayed in the future with that has a smaller seqId // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits // that we have picked the flush files for if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) { this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores; } } if (!map.isEmpty()) { for (Map.Entry entry : map.entrySet()) { // Drop the memstore contents if they are now smaller than the latest seen flushed file totalFreedDataSize += dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey()) .getDataSize(); } } // C. Finally notify anyone waiting on memstore to clear: // e.g. checkResources(). synchronized (this) { notifyAll(); // FindBugs NN_NAKED_NOTIFY } return totalFreedDataSize > 0; } finally { closeRegionOperation(); } } private void logRegionFiles() { if (LOG.isTraceEnabled()) { LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: "); stores.values().stream().filter(s -> s.getStorefiles() != null) .flatMap(s -> s.getStorefiles().stream()) .forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf)); } } /** Checks whether the given regionName is either equal to our region, or that * the regionName is the primary region to our corresponding range for the secondary replica. */ private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload) throws WrongRegionException { if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) { return; } if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) && Bytes.equals(encodedRegionName, this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) { return; } throw new WrongRegionException(exceptionMsg + payload + " targetted for region " + Bytes.toStringBinary(encodedRegionName) + " does not match this region: " + this.getRegionInfo()); } /** * Used by tests * @param s Store to add edit too. * @param cell Cell to add. */ @VisibleForTesting protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) { s.add(cell, memstoreAccounting); } /** * @param p File to check. * @return True if file was zero-length (and if so, we'll delete it in here). * @throws IOException */ private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat, final Path p) throws IOException { if (stat.getLen() > 0) { return false; } LOG.warn("File " + p + " is zero-length, deleting."); fs.delete(p, false); return true; } protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup) throws IOException { if (family.isMobEnabled()) { if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) { throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY + " accordingly."); } return new HMobStore(this, family, this.conf, warmup); } return new HStore(this, family, this.conf, warmup); } @Override public HStore getStore(byte[] column) { return this.stores.get(column); } /** * Return HStore instance. Does not do any copy: as the number of store is limited, we iterate on * the list. */ private HStore getStore(Cell cell) { return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey())) .map(e -> e.getValue()).findFirst().orElse(null); } @Override public List getStores() { return new ArrayList<>(stores.values()); } @Override public List getStoreFileList(byte[][] columns) throws IllegalArgumentException { List storeFileNames = new ArrayList<>(); synchronized (closeLock) { for (byte[] column : columns) { HStore store = this.stores.get(column); if (store == null) { throw new IllegalArgumentException( "No column family : " + new String(column, StandardCharsets.UTF_8) + " available"); } Collection storeFiles = store.getStorefiles(); if (storeFiles == null) { continue; } for (HStoreFile storeFile : storeFiles) { storeFileNames.add(storeFile.getPath().toString()); } logRegionFiles(); } } return storeFileNames; } ////////////////////////////////////////////////////////////////////////////// // Support code ////////////////////////////////////////////////////////////////////////////// /** Make sure this is a valid row for the HRegion */ void checkRow(byte[] row, String op) throws IOException { if (!rowIsInRange(getRegionInfo(), row)) { throw new WrongRegionException("Requested row out of range for " + op + " on HRegion " + this + ", startKey='" + Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" + Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" + Bytes.toStringBinary(row) + "'"); } } /** * Get an exclusive ( write lock ) lock on a given row. * @param row Which row to lock. * @return A locked RowLock. The lock is exclusive and already aqquired. * @throws IOException */ public RowLock getRowLock(byte[] row) throws IOException { return getRowLock(row, false); } @Override public RowLock getRowLock(byte[] row, boolean readLock) throws IOException { checkRow(row, "row lock"); return getRowLockInternal(row, readLock, null); } protected RowLock getRowLockInternal(byte[] row, boolean readLock, final RowLock prevRowLock) throws IOException { // create an object to use a a key in the row lock map HashedBytes rowKey = new HashedBytes(row); RowLockContext rowLockContext = null; RowLockImpl result = null; boolean success = false; try (TraceScope scope = TraceUtil.createTrace("HRegion.getRowLock")) { TraceUtil.addTimelineAnnotation("Getting a " + (readLock?"readLock":"writeLock")); // Keep trying until we have a lock or error out. // TODO: do we need to add a time component here? while (result == null) { rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey)); // Now try an get the lock. // This can fail as if (readLock) { // For read lock, if the caller has locked the same row previously, it will not try // to acquire the same read lock. It simply returns the previous row lock. RowLockImpl prevRowLockImpl = (RowLockImpl)prevRowLock; if ((prevRowLockImpl != null) && (prevRowLockImpl.getLock() == rowLockContext.readWriteLock.readLock())) { success = true; return prevRowLock; } result = rowLockContext.newReadLock(); } else { result = rowLockContext.newWriteLock(); } } int timeout = rowLockWaitDuration; boolean reachDeadlineFirst = false; Optional call = RpcServer.getCurrentCall(); if (call.isPresent()) { long deadline = call.get().getDeadline(); if (deadline < Long.MAX_VALUE) { int timeToDeadline = (int) (deadline - System.currentTimeMillis()); if (timeToDeadline <= this.rowLockWaitDuration) { reachDeadlineFirst = true; timeout = timeToDeadline; } } } if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) { TraceUtil.addTimelineAnnotation("Failed to get row lock"); String message = "Timed out waiting for lock for row: " + rowKey + " in region " + getRegionInfo().getEncodedName(); if (reachDeadlineFirst) { throw new TimeoutIOException(message); } else { // If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request. throw new IOException(message); } } rowLockContext.setThreadName(Thread.currentThread().getName()); success = true; return result; } catch (InterruptedException ie) { LOG.warn("Thread interrupted waiting for lock on row: {}, in region {}", rowKey, getRegionInfo().getRegionNameAsString()); InterruptedIOException iie = new InterruptedIOException(); iie.initCause(ie); TraceUtil.addTimelineAnnotation("Interrupted exception getting row lock"); Thread.currentThread().interrupt(); throw iie; } catch (Error error) { // The maximum lock count for read lock is 64K (hardcoded), when this maximum count // is reached, it will throw out an Error. This Error needs to be caught so it can // go ahead to process the minibatch with lock acquired. LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row), getRegionInfo().getRegionNameAsString(), error); IOException ioe = new IOException(error); TraceUtil.addTimelineAnnotation("Error getting row lock"); throw ioe; } finally { // Clean up the counts just in case this was the thing keeping the context alive. if (!success && rowLockContext != null) { rowLockContext.cleanUp(); } } } private void releaseRowLocks(List rowLocks) { if (rowLocks != null) { for (RowLock rowLock : rowLocks) { rowLock.release(); } rowLocks.clear(); } } @VisibleForTesting public int getReadLockCount() { return lock.getReadLockCount(); } public ConcurrentHashMap getLockedRows() { return lockedRows; } @VisibleForTesting class RowLockContext { private final HashedBytes row; final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true); final AtomicBoolean usable = new AtomicBoolean(true); final AtomicInteger count = new AtomicInteger(0); final Object lock = new Object(); private String threadName; RowLockContext(HashedBytes row) { this.row = row; } RowLockImpl newWriteLock() { Lock l = readWriteLock.writeLock(); return getRowLock(l); } RowLockImpl newReadLock() { Lock l = readWriteLock.readLock(); return getRowLock(l); } private RowLockImpl getRowLock(Lock l) { count.incrementAndGet(); synchronized (lock) { if (usable.get()) { return new RowLockImpl(this, l); } else { return null; } } } void cleanUp() { long c = count.decrementAndGet(); if (c <= 0) { synchronized (lock) { if (count.get() <= 0 && usable.get()){ // Don't attempt to remove row if already removed usable.set(false); RowLockContext removed = lockedRows.remove(row); assert removed == this: "we should never remove a different context"; } } } } public void setThreadName(String threadName) { this.threadName = threadName; } @Override public String toString() { return "RowLockContext{" + "row=" + row + ", readWriteLock=" + readWriteLock + ", count=" + count + ", threadName=" + threadName + '}'; } } /** * Class used to represent a lock on a row. */ public static class RowLockImpl implements RowLock { private final RowLockContext context; private final Lock lock; public RowLockImpl(RowLockContext context, Lock lock) { this.context = context; this.lock = lock; } public Lock getLock() { return lock; } @VisibleForTesting public RowLockContext getContext() { return context; } @Override public void release() { lock.unlock(); context.cleanUp(); } @Override public String toString() { return "RowLockImpl{" + "context=" + context + ", lock=" + lock + '}'; } } /** * Determines whether multiple column families are present * Precondition: familyPaths is not null * * @param familyPaths List of (column family, hfilePath) */ private static boolean hasMultipleColumnFamilies(Collection> familyPaths) { boolean multipleFamilies = false; byte[] family = null; for (Pair pair : familyPaths) { byte[] fam = pair.getFirst(); if (family == null) { family = fam; } else if (!Bytes.equals(family, fam)) { multipleFamilies = true; break; } } return multipleFamilies; } /** * Attempts to atomically load a group of hfiles. This is critical for loading * rows with multiple column families atomically. * * @param familyPaths List of Pair<byte[] column family, String hfilePath> * @param bulkLoadListener Internal hooks enabling massaging/preparation of a * file about to be bulk loaded * @param assignSeqId * @return Map from family to List of store file paths if successful, null if failed recoverably * @throws IOException if failed unrecoverably. */ public Map> bulkLoadHFiles(Collection> familyPaths, boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException { return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false, null, true); } /** * Listener class to enable callers of * bulkLoadHFile() to perform any necessary * pre/post processing of a given bulkload call */ public interface BulkLoadListener { /** * Called before an HFile is actually loaded * @param family family being loaded to * @param srcPath path of HFile * @return final path to be used for actual loading * @throws IOException */ String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile) throws IOException; /** * Called after a successful HFile load * @param family family being loaded to * @param srcPath path of HFile * @throws IOException */ void doneBulkLoad(byte[] family, String srcPath) throws IOException; /** * Called after a failed HFile load * @param family family being loaded to * @param srcPath path of HFile * @throws IOException */ void failedBulkLoad(byte[] family, String srcPath) throws IOException; } /** * Attempts to atomically load a group of hfiles. This is critical for loading * rows with multiple column families atomically. * * @param familyPaths List of Pair<byte[] column family, String hfilePath> * @param assignSeqId * @param bulkLoadListener Internal hooks enabling massaging/preparation of a * file about to be bulk loaded * @param copyFile always copy hfiles if true * @param clusterIds ids from clusters that had already handled the given bulkload event. * @return Map from family to List of store file paths if successful, null if failed recoverably * @throws IOException if failed unrecoverably. */ public Map> bulkLoadHFiles(Collection> familyPaths, boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile, List clusterIds, boolean replicate) throws IOException { long seqId = -1; Map> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); Map storeFilesSizes = new HashMap<>(); Preconditions.checkNotNull(familyPaths); // we need writeLock for multi-family bulk load startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths)); boolean isSuccessful = false; try { this.writeRequestsCount.increment(); // There possibly was a split that happened between when the split keys // were gathered and before the HRegion's write lock was taken. We need // to validate the HFile region before attempting to bulk load all of them IOException ioException = null; List> failures = new ArrayList<>(); for (Pair p : familyPaths) { byte[] familyName = p.getFirst(); String path = p.getSecond(); HStore store = getStore(familyName); if (store == null) { ioException = new org.apache.hadoop.hbase.DoNotRetryIOException( "No such column family " + Bytes.toStringBinary(familyName)); } else { try { store.assertBulkLoadHFileOk(new Path(path)); } catch (WrongRegionException wre) { // recoverable (file doesn't fit in region) failures.add(p); } catch (IOException ioe) { // unrecoverable (hdfs problem) ioException = ioe; } } // validation failed because of some sort of IO problem. if (ioException != null) { LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this, ioException); throw ioException; } } // validation failed, bail out before doing anything permanent. if (failures.size() != 0) { StringBuilder list = new StringBuilder(); for (Pair p : failures) { list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ") .append(p.getSecond()); } // problem when validating LOG.warn("There was a recoverable bulk load failure likely due to a split. These (family," + " HFile) pairs were not loaded: {}, in region {}", list.toString(), this); return null; } // We need to assign a sequential ID that's in between two memstores in order to preserve // the guarantee that all the edits lower than the highest sequential ID from all the // HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is // a sequence id that we can be sure is beyond the last hfile written). if (assignSeqId) { FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY); if (fs.isFlushSucceeded()) { seqId = ((FlushResultImpl)fs).flushSequenceId; } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) { seqId = ((FlushResultImpl)fs).flushSequenceId; } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) { // CANNOT_FLUSH may mean that a flush is already on-going // we need to wait for that flush to complete waitForFlushes(); } else { throw new IOException("Could not bulk load with an assigned sequential ID because the "+ "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason); } } Map>> familyWithFinalPath = new TreeMap<>(Bytes.BYTES_COMPARATOR); for (Pair p : familyPaths) { byte[] familyName = p.getFirst(); String path = p.getSecond(); HStore store = getStore(familyName); if (!familyWithFinalPath.containsKey(familyName)) { familyWithFinalPath.put(familyName, new ArrayList<>()); } List> lst = familyWithFinalPath.get(familyName); try { String finalPath = path; if (bulkLoadListener != null) { finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile); } Pair pair = store.preBulkLoadHFile(finalPath, seqId); lst.add(pair); } catch (IOException ioe) { // A failure here can cause an atomicity violation that we currently // cannot recover from since it is likely a failed HDFS operation. LOG.error("There was a partial failure due to IO when attempting to" + " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe); if (bulkLoadListener != null) { try { bulkLoadListener.failedBulkLoad(familyName, path); } catch (Exception ex) { LOG.error("Error while calling failedBulkLoad for family " + Bytes.toString(familyName) + " with path " + path, ex); } } throw ioe; } } if (this.getCoprocessorHost() != null) { for (Map.Entry>> entry : familyWithFinalPath.entrySet()) { this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue()); } } for (Map.Entry>> entry : familyWithFinalPath.entrySet()) { byte[] familyName = entry.getKey(); for (Pair p : entry.getValue()) { String path = p.getFirst().toString(); Path commitedStoreFile = p.getSecond(); HStore store = getStore(familyName); try { store.bulkLoadHFile(familyName, path, commitedStoreFile); // Note the size of the store file try { FileSystem fs = commitedStoreFile.getFileSystem(baseConf); storeFilesSizes.put(commitedStoreFile.getName(), fs.getFileStatus(commitedStoreFile) .getLen()); } catch (IOException e) { LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e); storeFilesSizes.put(commitedStoreFile.getName(), 0L); } if(storeFiles.containsKey(familyName)) { storeFiles.get(familyName).add(commitedStoreFile); } else { List storeFileNames = new ArrayList<>(); storeFileNames.add(commitedStoreFile); storeFiles.put(familyName, storeFileNames); } if (bulkLoadListener != null) { bulkLoadListener.doneBulkLoad(familyName, path); } } catch (IOException ioe) { // A failure here can cause an atomicity violation that we currently // cannot recover from since it is likely a failed HDFS operation. // TODO Need a better story for reverting partial failures due to HDFS. LOG.error("There was a partial failure due to IO when attempting to" + " load " + Bytes.toString(familyName) + " : " + p.getSecond(), ioe); if (bulkLoadListener != null) { try { bulkLoadListener.failedBulkLoad(familyName, path); } catch (Exception ex) { LOG.error("Error while calling failedBulkLoad for family " + Bytes.toString(familyName) + " with path " + path, ex); } } throw ioe; } } } isSuccessful = true; } finally { if (wal != null && !storeFiles.isEmpty()) { // Write a bulk load event for hfiles that are loaded try { WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(), UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles, storeFilesSizes, seqId, clusterIds, replicate); WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(), loadDescriptor, mvcc); } catch (IOException ioe) { if (this.rsServices != null) { // Have to abort region server because some hfiles has been loaded but we can't write // the event into WAL isSuccessful = false; this.rsServices.abort("Failed to write bulk load event into WAL.", ioe); } } } closeBulkRegionOperation(); } return isSuccessful ? storeFiles : null; } @Override public boolean equals(Object o) { return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(), ((HRegion) o).getRegionInfo().getRegionName()); } @Override public int hashCode() { return Bytes.hashCode(getRegionInfo().getRegionName()); } @Override public String toString() { return getRegionInfo().getRegionNameAsString(); } /** * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families). */ class RegionScannerImpl implements RegionScanner, Shipper, org.apache.hadoop.hbase.ipc.RpcCallback { // Package local for testability KeyValueHeap storeHeap = null; /** Heap of key-values that are not essential for the provided filters and are thus read * on demand, if on-demand column family loading is enabled.*/ KeyValueHeap joinedHeap = null; /** * If the joined heap data gathering is interrupted due to scan limits, this will * contain the row for which we are populating the values.*/ protected Cell joinedContinuationRow = null; private boolean filterClosed = false; protected final byte[] stopRow; protected final boolean includeStopRow; protected final HRegion region; protected final CellComparator comparator; private final long readPt; private final long maxResultSize; private final ScannerContext defaultScannerContext; private final FilterWrapper filter; @Override public RegionInfo getRegionInfo() { return region.getRegionInfo(); } RegionScannerImpl(Scan scan, List additionalScanners, HRegion region) throws IOException { this(scan, additionalScanners, region, HConstants.NO_NONCE, HConstants.NO_NONCE); } RegionScannerImpl(Scan scan, List additionalScanners, HRegion region, long nonceGroup, long nonce) throws IOException { this.region = region; this.maxResultSize = scan.getMaxResultSize(); if (scan.hasFilter()) { this.filter = new FilterWrapper(scan.getFilter()); } else { this.filter = null; } this.comparator = region.getCellComparator(); /** * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default * scanner context that can be used to enforce the batch limit in the event that a * ScannerContext is not specified during an invocation of next/nextRaw */ defaultScannerContext = ScannerContext.newBuilder() .setBatchLimit(scan.getBatch()).build(); this.stopRow = scan.getStopRow(); this.includeStopRow = scan.includeStopRow(); // synchronize on scannerReadPoints so that nobody calculates // getSmallestReadPoint, before scannerReadPoints is updated. IsolationLevel isolationLevel = scan.getIsolationLevel(); long mvccReadPoint = PackagePrivateFieldAccessor.getMvccReadPoint(scan); synchronized (scannerReadPoints) { if (mvccReadPoint > 0) { this.readPt = mvccReadPoint; } else if (nonce == HConstants.NO_NONCE || rsServices == null || rsServices.getNonceManager() == null) { this.readPt = getReadPoint(isolationLevel); } else { this.readPt = rsServices.getNonceManager().getMvccFromOperationContext(nonceGroup, nonce); } scannerReadPoints.put(this, this.readPt); } initializeScanners(scan, additionalScanners); } protected void initializeScanners(Scan scan, List additionalScanners) throws IOException { // Here we separate all scanners into two lists - scanner that provide data required // by the filter to operate (scanners list) and all others (joinedScanners list). List scanners = new ArrayList<>(scan.getFamilyMap().size()); List joinedScanners = new ArrayList<>(scan.getFamilyMap().size()); // Store all already instantiated scanners for exception handling List instantiatedScanners = new ArrayList<>(); // handle additionalScanners if (additionalScanners != null && !additionalScanners.isEmpty()) { scanners.addAll(additionalScanners); instantiatedScanners.addAll(additionalScanners); } try { for (Map.Entry> entry : scan.getFamilyMap().entrySet()) { HStore store = stores.get(entry.getKey()); KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt); instantiatedScanners.add(scanner); if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand() || this.filter.isFamilyEssential(entry.getKey())) { scanners.add(scanner); } else { joinedScanners.add(scanner); } } initializeKVHeap(scanners, joinedScanners, region); } catch (Throwable t) { throw handleException(instantiatedScanners, t); } } protected void initializeKVHeap(List scanners, List joinedScanners, HRegion region) throws IOException { this.storeHeap = new KeyValueHeap(scanners, comparator); if (!joinedScanners.isEmpty()) { this.joinedHeap = new KeyValueHeap(joinedScanners, comparator); } } private IOException handleException(List instantiatedScanners, Throwable t) { // remove scaner read point before throw the exception scannerReadPoints.remove(this); if (storeHeap != null) { storeHeap.close(); storeHeap = null; if (joinedHeap != null) { joinedHeap.close(); joinedHeap = null; } } else { // close all already instantiated scanners before throwing the exception for (KeyValueScanner scanner : instantiatedScanners) { scanner.close(); } } return t instanceof IOException ? (IOException) t : new IOException(t); } @Override public long getMaxResultSize() { return maxResultSize; } @Override public long getMvccReadPoint() { return this.readPt; } @Override public int getBatch() { return this.defaultScannerContext.getBatchLimit(); } /** * Reset both the filter and the old filter. * * @throws IOException in case a filter raises an I/O exception. */ protected void resetFilters() throws IOException { if (filter != null) { filter.reset(); } } @Override public boolean next(List outResults) throws IOException { // apply the batching limit by default return next(outResults, defaultScannerContext); } @Override public synchronized boolean next(List outResults, ScannerContext scannerContext) throws IOException { if (this.filterClosed) { throw new UnknownScannerException("Scanner was closed (timed out?) " + "after we renewed it. Could be caused by a very slow scanner " + "or a lengthy garbage collection"); } startRegionOperation(Operation.SCAN); try { return nextRaw(outResults, scannerContext); } finally { closeRegionOperation(Operation.SCAN); } } @Override public boolean nextRaw(List outResults) throws IOException { // Use the RegionScanner's context by default return nextRaw(outResults, defaultScannerContext); } @Override public boolean nextRaw(List outResults, ScannerContext scannerContext) throws IOException { if (storeHeap == null) { // scanner is closed throw new UnknownScannerException("Scanner was closed"); } boolean moreValues = false; if (outResults.isEmpty()) { // Usually outResults is empty. This is true when next is called // to handle scan or get operation. moreValues = nextInternal(outResults, scannerContext); } else { List tmpList = new ArrayList<>(); moreValues = nextInternal(tmpList, scannerContext); outResults.addAll(tmpList); } readRequestsCount.increment(); if (metricsRegion != null) { metricsRegion.updateReadRequestCount(); } // If the size limit was reached it means a partial Result is being returned. Returning a // partial Result means that we should not reset the filters; filters should only be reset in // between rows if (!scannerContext.mayHaveMoreCellsInRow()) { resetFilters(); } if (isFilterDoneInternal()) { moreValues = false; } return moreValues; } /** * @return true if more cells exist after this batch, false if scanner is done */ private boolean populateFromJoinedHeap(List results, ScannerContext scannerContext) throws IOException { assert joinedContinuationRow != null; boolean moreValues = populateResult(results, this.joinedHeap, scannerContext, joinedContinuationRow); if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) { // We are done with this row, reset the continuation. joinedContinuationRow = null; } // As the data is obtained from two independent heaps, we need to // ensure that result list is sorted, because Result relies on that. sort(results, comparator); return moreValues; } /** * Fetches records with currentRow into results list, until next row, batchLimit (if not -1) is * reached, or remainingResultSize (if not -1) is reaced * @param heap KeyValueHeap to fetch data from.It must be positioned on correct row before call. * @param scannerContext * @param currentRowCell * @return state of last call to {@link KeyValueHeap#next()} */ private boolean populateResult(List results, KeyValueHeap heap, ScannerContext scannerContext, Cell currentRowCell) throws IOException { Cell nextKv; boolean moreCellsInRow = false; boolean tmpKeepProgress = scannerContext.getKeepProgress(); // Scanning between column families and thus the scope is between cells LimitScope limitScope = LimitScope.BETWEEN_CELLS; do { // We want to maintain any progress that is made towards the limits while scanning across // different column families. To do this, we toggle the keep progress flag on during calls // to the StoreScanner to ensure that any progress made thus far is not wiped away. scannerContext.setKeepProgress(true); heap.next(results, scannerContext); scannerContext.setKeepProgress(tmpKeepProgress); nextKv = heap.peek(); moreCellsInRow = moreCellsInRow(nextKv, currentRowCell); if (!moreCellsInRow) incrementCountOfRowsScannedMetric(scannerContext); if (moreCellsInRow && scannerContext.checkBatchLimit(limitScope)) { return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues(); } else if (scannerContext.checkSizeLimit(limitScope)) { ScannerContext.NextState state = moreCellsInRow ? NextState.SIZE_LIMIT_REACHED_MID_ROW : NextState.SIZE_LIMIT_REACHED; return scannerContext.setScannerState(state).hasMoreValues(); } else if (scannerContext.checkTimeLimit(limitScope)) { ScannerContext.NextState state = moreCellsInRow ? NextState.TIME_LIMIT_REACHED_MID_ROW : NextState.TIME_LIMIT_REACHED; return scannerContext.setScannerState(state).hasMoreValues(); } } while (moreCellsInRow); return nextKv != null; } /** * Based on the nextKv in the heap, and the current row, decide whether or not there are more * cells to be read in the heap. If the row of the nextKv in the heap matches the current row * then there are more cells to be read in the row. * @param nextKv * @param currentRowCell * @return true When there are more cells in the row to be read */ private boolean moreCellsInRow(final Cell nextKv, Cell currentRowCell) { return nextKv != null && CellUtil.matchingRows(nextKv, currentRowCell); } /* * @return True if a filter rules the scanner is over, done. */ @Override public synchronized boolean isFilterDone() throws IOException { return isFilterDoneInternal(); } private boolean isFilterDoneInternal() throws IOException { return this.filter != null && this.filter.filterAllRemaining(); } private boolean nextInternal(List results, ScannerContext scannerContext) throws IOException { if (!results.isEmpty()) { throw new IllegalArgumentException("First parameter should be an empty list"); } if (scannerContext == null) { throw new IllegalArgumentException("Scanner context cannot be null"); } Optional rpcCall = RpcServer.getCurrentCall(); // Save the initial progress from the Scanner context in these local variables. The progress // may need to be reset a few times if rows are being filtered out so we save the initial // progress. int initialBatchProgress = scannerContext.getBatchProgress(); long initialSizeProgress = scannerContext.getDataSizeProgress(); long initialHeapSizeProgress = scannerContext.getHeapSizeProgress(); // Used to check time limit LimitScope limitScope = LimitScope.BETWEEN_CELLS; // The loop here is used only when at some point during the next we determine // that due to effects of filters or otherwise, we have an empty row in the result. // Then we loop and try again. Otherwise, we must get out on the first iteration via return, // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row, // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow). while (true) { // Starting to scan a new row. Reset the scanner progress according to whether or not // progress should be kept. if (scannerContext.getKeepProgress()) { // Progress should be kept. Reset to initial values seen at start of method invocation. scannerContext.setProgress(initialBatchProgress, initialSizeProgress, initialHeapSizeProgress); } else { scannerContext.clearProgress(); } if (rpcCall.isPresent()) { // If a user specifies a too-restrictive or too-slow scanner, the // client might time out and disconnect while the server side // is still processing the request. We should abort aggressively // in that case. long afterTime = rpcCall.get().disconnectSince(); if (afterTime >= 0) { throw new CallerDisconnectedException( "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " + this + " after " + afterTime + " ms, since " + "caller disconnected"); } } // Let's see what we have in the storeHeap. Cell current = this.storeHeap.peek(); boolean shouldStop = shouldStop(current); // When has filter row is true it means that the all the cells for a particular row must be // read before a filtering decision can be made. This means that filters where hasFilterRow // run the risk of enLongAddering out of memory errors in the case that they are applied to a // table that has very large rows. boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow(); // If filter#hasFilterRow is true, partial results are not allowed since allowing them // would prevent the filters from being evaluated. Thus, if it is true, change the // scope of any limits that could potentially create partial results to // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row if (hasFilterRow) { if (LOG.isTraceEnabled()) { LOG.trace("filter#hasFilterRow is true which prevents partial results from being " + " formed. Changing scope of limits that may create partials"); } scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS); scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS); limitScope = LimitScope.BETWEEN_ROWS; } if (scannerContext.checkTimeLimit(LimitScope.BETWEEN_CELLS)) { if (hasFilterRow) { throw new IncompatibleFilterException( "Filter whose hasFilterRow() returns true is incompatible with scans that must " + " stop mid-row because of a limit. ScannerContext:" + scannerContext); } return true; } // Check if we were getting data from the joinedHeap and hit the limit. // If not, then it's main path - getting results from storeHeap. if (joinedContinuationRow == null) { // First, check if we are at a stop row. If so, there are no more results. if (shouldStop) { if (hasFilterRow) { filter.filterRowCells(results); } return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } // Check if rowkey filter wants to exclude this row. If so, loop to next. // Technically, if we hit limits before on this row, we don't need this call. if (filterRowKey(current)) { incrementCountOfRowsFilteredMetric(scannerContext); // early check, see HBASE-16296 if (isFilterDoneInternal()) { return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } // Typically the count of rows scanned is incremented inside #populateResult. However, // here we are filtering a row based purely on its row key, preventing us from calling // #populateResult. Thus, perform the necessary increment here to rows scanned metric incrementCountOfRowsScannedMetric(scannerContext); boolean moreRows = nextRow(scannerContext, current); if (!moreRows) { return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } results.clear(); // Read nothing as the rowkey was filtered, but still need to check time limit if (scannerContext.checkTimeLimit(limitScope)) { return true; } continue; } // Ok, we are good, let's try to get some results from the main heap. populateResult(results, this.storeHeap, scannerContext, current); if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) { if (hasFilterRow) { throw new IncompatibleFilterException( "Filter whose hasFilterRow() returns true is incompatible with scans that must " + " stop mid-row because of a limit. ScannerContext:" + scannerContext); } return true; } Cell nextKv = this.storeHeap.peek(); shouldStop = shouldStop(nextKv); // save that the row was empty before filters applied to it. final boolean isEmptyRow = results.isEmpty(); // We have the part of the row necessary for filtering (all of it, usually). // First filter with the filterRow(List). FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED; if (hasFilterRow) { ret = filter.filterRowCellsWithRet(results); // We don't know how the results have changed after being filtered. Must set progress // according to contents of results now. if (scannerContext.getKeepProgress()) { scannerContext.setProgress(initialBatchProgress, initialSizeProgress, initialHeapSizeProgress); } else { scannerContext.clearProgress(); } scannerContext.incrementBatchProgress(results.size()); for (Cell cell : results) { scannerContext.incrementSizeProgress(PrivateCellUtil.estimatedSerializedSizeOf(cell), cell.heapSize()); } } if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) { incrementCountOfRowsFilteredMetric(scannerContext); results.clear(); boolean moreRows = nextRow(scannerContext, current); if (!moreRows) { return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } // This row was totally filtered out, if this is NOT the last row, // we should continue on. Otherwise, nothing else to do. if (!shouldStop) { // Read nothing as the cells was filtered, but still need to check time limit if (scannerContext.checkTimeLimit(limitScope)) { return true; } continue; } return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } // Ok, we are done with storeHeap for this row. // Now we may need to fetch additional, non-essential data into row. // These values are not needed for filter to work, so we postpone their // fetch to (possibly) reduce amount of data loads from disk. if (this.joinedHeap != null) { boolean mayHaveData = joinedHeapMayHaveData(current); if (mayHaveData) { joinedContinuationRow = current; populateFromJoinedHeap(results, scannerContext); if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) { return true; } } } } else { // Populating from the joined heap was stopped by limits, populate some more. populateFromJoinedHeap(results, scannerContext); if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) { return true; } } // We may have just called populateFromJoinedMap and hit the limits. If that is // the case, we need to call it again on the next next() invocation. if (joinedContinuationRow != null) { return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues(); } // Finally, we are done with both joinedHeap and storeHeap. // Double check to prevent empty rows from appearing in result. It could be // the case when SingleColumnValueExcludeFilter is used. if (results.isEmpty()) { incrementCountOfRowsFilteredMetric(scannerContext); boolean moreRows = nextRow(scannerContext, current); if (!moreRows) { return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } if (!shouldStop) continue; } if (shouldStop) { return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues(); } else { return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues(); } } } protected void incrementCountOfRowsFilteredMetric(ScannerContext scannerContext) { filteredReadRequestsCount.increment(); if (metricsRegion != null) { metricsRegion.updateFilteredRecords(); } if (scannerContext == null || !scannerContext.isTrackingMetrics()) return; scannerContext.getMetrics().countOfRowsFiltered.incrementAndGet(); } protected void incrementCountOfRowsScannedMetric(ScannerContext scannerContext) { if (scannerContext == null || !scannerContext.isTrackingMetrics()) return; scannerContext.getMetrics().countOfRowsScanned.incrementAndGet(); } /** * @param currentRowCell * @return true when the joined heap may have data for the current row * @throws IOException */ private boolean joinedHeapMayHaveData(Cell currentRowCell) throws IOException { Cell nextJoinedKv = joinedHeap.peek(); boolean matchCurrentRow = nextJoinedKv != null && CellUtil.matchingRows(nextJoinedKv, currentRowCell); boolean matchAfterSeek = false; // If the next value in the joined heap does not match the current row, try to seek to the // correct row if (!matchCurrentRow) { Cell firstOnCurrentRow = PrivateCellUtil.createFirstOnRow(currentRowCell); boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true); matchAfterSeek = seekSuccessful && joinedHeap.peek() != null && CellUtil.matchingRows(joinedHeap.peek(), currentRowCell); } return matchCurrentRow || matchAfterSeek; } /** * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines * both filterRow & filterRow({@code List kvs}) functions. While 0.94 code or older, * it may not implement hasFilterRow as HBase-6429 expects because 0.94 hasFilterRow() only * returns true when filterRow({@code List kvs}) is overridden not the filterRow(). * Therefore, the filterRow() will be skipped. */ private boolean filterRow() throws IOException { // when hasFilterRow returns true, filter.filterRow() will be called automatically inside // filterRowCells(List kvs) so we skip that scenario here. return filter != null && (!filter.hasFilterRow()) && filter.filterRow(); } private boolean filterRowKey(Cell current) throws IOException { return filter != null && filter.filterRowKey(current); } protected boolean nextRow(ScannerContext scannerContext, Cell curRowCell) throws IOException { assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read."; Cell next; while ((next = this.storeHeap.peek()) != null && CellUtil.matchingRows(next, curRowCell)) { this.storeHeap.next(MOCKED_LIST); } resetFilters(); // Calling the hook in CP which allows it to do a fast forward return this.region.getCoprocessorHost() == null || this.region.getCoprocessorHost() .postScannerFilterRow(this, curRowCell); } protected boolean shouldStop(Cell currentRowCell) { if (currentRowCell == null) { return true; } if (stopRow == null || Bytes.equals(stopRow, HConstants.EMPTY_END_ROW)) { return false; } int c = comparator.compareRows(currentRowCell, stopRow, 0, stopRow.length); return c > 0 || (c == 0 && !includeStopRow); } @Override public synchronized void close() { if (storeHeap != null) { storeHeap.close(); storeHeap = null; } if (joinedHeap != null) { joinedHeap.close(); joinedHeap = null; } // no need to synchronize here. scannerReadPoints.remove(this); this.filterClosed = true; } KeyValueHeap getStoreHeapForTesting() { return storeHeap; } @Override public synchronized boolean reseek(byte[] row) throws IOException { if (row == null) { throw new IllegalArgumentException("Row cannot be null."); } boolean result = false; startRegionOperation(); Cell kv = PrivateCellUtil.createFirstOnRow(row, 0, (short) row.length); try { // use request seek to make use of the lazy seek option. See HBASE-5520 result = this.storeHeap.requestSeek(kv, true, true); if (this.joinedHeap != null) { result = this.joinedHeap.requestSeek(kv, true, true) || result; } } finally { closeRegionOperation(); } return result; } @Override public void shipped() throws IOException { if (storeHeap != null) { storeHeap.shipped(); } if (joinedHeap != null) { joinedHeap.shipped(); } } @Override public void run() throws IOException { // This is the RPC callback method executed. We do the close in of the scanner in this // callback this.close(); } } // Utility methods /** * A utility method to create new instances of HRegion based on the * {@link HConstants#REGION_IMPL} configuration property. * @param tableDir qualified path of directory where region should be located, * usually the table directory. * @param wal The WAL is the outbound log for any updates to the HRegion * The wal file is a logfile from the previous execution that's * custom-computed for this HRegion. The HRegionServer computes and sorts the * appropriate wal info for this HRegion. If there is a previous file * (implying that the HRegion has been written-to before), then read it from * the supplied path. * @param fs is the filesystem. * @param conf is global configuration settings. * @param regionInfo - RegionInfo that describes the region * is new), then read them from the supplied path. * @param htd the table descriptor * @return the new instance */ public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration conf, RegionInfo regionInfo, final TableDescriptor htd, RegionServerServices rsServices) { try { @SuppressWarnings("unchecked") Class regionClass = (Class) conf.getClass(HConstants.REGION_IMPL, HRegion.class); Constructor c = regionClass.getConstructor(Path.class, WAL.class, FileSystem.class, Configuration.class, RegionInfo.class, TableDescriptor.class, RegionServerServices.class); return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices); } catch (Throwable e) { // todo: what should I throw here? throw new IllegalStateException("Could not instantiate a region instance.", e); } } /** * Convenience method creating new HRegions. Used by createTable. * * @param info Info for region to create. * @param rootDir Root directory for HBase instance * @param wal shared WAL * @param initialize - true to initialize the region * @return new HRegion */ public static HRegion createHRegion(final RegionInfo info, final Path rootDir, final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, final boolean initialize) throws IOException { LOG.info("creating " + info + ", tableDescriptor=" + (hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir); createRegionDir(conf, info, rootDir); FileSystem fs = rootDir.getFileSystem(conf); Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, null); if (initialize) { region.initialize(null); } return region; } /** * Create a region under the given table directory. */ public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs, Path tableDir, TableDescriptor tableDesc) throws IOException { LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc, tableDir); HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo); HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null); return region; } /** * Create the region directory in the filesystem. */ public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri, Path rootDir) throws IOException { FileSystem fs = rootDir.getFileSystem(configuration); Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable()); // If directory already exists, will log warning and keep going. Will try to create // .regioninfo. If one exists, will overwrite. return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri); } public static HRegion createHRegion(final RegionInfo info, final Path rootDir, final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal) throws IOException { return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true); } /** * Open a Region. * @param info Info for region to be opened. * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @return new HRegion * * @throws IOException */ public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf) throws IOException { return openHRegion(info, htd, wal, conf, null, null); } /** * Open a Region. * @param info Info for region to be opened * @param htd the table descriptor * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @param conf The Configuration object to use. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion * * @throws IOException */ public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter); } /** * Open a Region. * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @param conf The Configuration object to use. * @return new HRegion * @throws IOException */ public static HRegion openHRegion(Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf) throws IOException { return openHRegion(rootDir, info, htd, wal, conf, null, null); } /** * Open a Region. * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @param conf The Configuration object to use. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion * @throws IOException */ public static HRegion openHRegion(final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { FileSystem fs = null; if (rsServices != null) { fs = rsServices.getFileSystem(); } if (fs == null) { fs = rootDir.getFileSystem(conf); } return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter); } /** * Open a Region. * @param conf The Configuration object to use. * @param fs Filesystem to use * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @return new HRegion */ public static HRegion openHRegion(final Configuration conf, final FileSystem fs, final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal) throws IOException { return openHRegion(conf, fs, rootDir, info, htd, wal, null, null); } /** * Open a Region. * @param conf The Configuration object to use. * @param fs Filesystem to use * @param rootDir Root directory for HBase instance * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegion(final Configuration conf, final FileSystem fs, final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter); } /** * Open a Region. * @param conf The Configuration object to use. * @param fs Filesystem to use * @param info Info for region to be opened. * @param htd the table descriptor * @param wal WAL for region to use. This method will call * WAL#setSequenceNumber(long) passing the result of the call to * HRegion#getMinSequenceId() to ensure the wal id is properly kept * up. HRegionStore does this every time it opens a new region. * @param rsServices An interface we can request flushes against. * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs, final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { Objects.requireNonNull(info, "RegionInfo cannot be null"); LOG.debug("Opening region: {}", info); HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices); return r.openHRegion(reporter); } @VisibleForTesting public NavigableMap getReplicationScope() { return this.replicationScope; } /** * Useful when reopening a closed region (normally for unit tests) * @param other original object * @param reporter An interface we can report progress against. * @return new HRegion */ public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter) throws IOException { HRegionFileSystem regionFs = other.getRegionFileSystem(); HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(), other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null); return r.openHRegion(reporter); } public static Region openHRegion(final Region other, final CancelableProgressable reporter) throws IOException { return openHRegion((HRegion)other, reporter); } /** * Open HRegion. * Calls initialize and sets sequenceId. * @return Returns this */ protected HRegion openHRegion(final CancelableProgressable reporter) throws IOException { try { // Refuse to open the region if we are missing local compression support TableDescriptorChecker.checkCompression(htableDescriptor); // Refuse to open the region if encryption configuration is incorrect or // codec support is missing TableDescriptorChecker.checkEncryption(conf, htableDescriptor); // Refuse to open the region if a required class cannot be loaded TableDescriptorChecker.checkClassLoading(conf, htableDescriptor); this.openSeqNum = initialize(reporter); this.mvcc.advanceTo(openSeqNum); // The openSeqNum must be increased every time when a region is assigned, as we rely on it to // determine whether a region has been successfully reopened. So here we always write open // marker, even if the table is read only. if (wal != null && getRegionServerServices() != null && RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { writeRegionOpenMarker(wal, openSeqNum); } } catch (Throwable t) { // By coprocessor path wrong region will open failed, // MetricsRegionWrapperImpl is already init and not close, // add region close when open failed try { // It is not required to write sequence id file when region open is failed. // Passing true to skip the sequence id file write. this.close(true); } catch (Throwable e) { LOG.warn("Open region: {} failed. Try close region but got exception ", this.getRegionInfo(), e); } throw t; } return this; } /** * Open a Region on a read-only file-system (like hdfs snapshots) * @param conf The Configuration object to use. * @param fs Filesystem to use * @param info Info for region to be opened. * @param htd the table descriptor * @return new HRegion */ public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs, final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException { if (info == null) { throw new NullPointerException("Passed region info is null"); } if (LOG.isDebugEnabled()) { LOG.debug("Opening region (readOnly filesystem): " + info); } if (info.getReplicaId() <= 0) { info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build(); } HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null); r.writestate.setReadOnly(true); return r.openHRegion(null); } public static void warmupHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, final Configuration conf, final RegionServerServices rsServices, final CancelableProgressable reporter) throws IOException { if (info == null) throw new NullPointerException("Passed region info is null"); if (LOG.isDebugEnabled()) { LOG.debug("HRegion.Warming up region: " + info); } Path rootDir = CommonFSUtils.getRootDir(conf); Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); FileSystem fs = null; if (rsServices != null) { fs = rsServices.getFileSystem(); } if (fs == null) { fs = rootDir.getFileSystem(conf); } HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null); r.initializeWarmup(reporter); } /** * Computes the Path of the HRegion * * @param tabledir qualified path for table * @param name ENCODED region name * @return Path of HRegion directory * @deprecated For tests only; to be removed. */ @Deprecated public static Path getRegionDir(final Path tabledir, final String name) { return new Path(tabledir, name); } /** * Determines if the specified row is within the row range specified by the * specified RegionInfo * * @param info RegionInfo that specifies the row range * @param row row to be checked * @return true if the row is within the range specified by the RegionInfo */ public static boolean rowIsInRange(RegionInfo info, final byte [] row) { return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), row) <= 0)) && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), row) > 0)); } public static boolean rowIsInRange(RegionInfo info, final byte [] row, final int offset, final short length) { return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length, row, offset, length) <= 0)) && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0)); } @Override public Result get(final Get get) throws IOException { prepareGet(get); List results = get(get, true); boolean stale = this.getRegionInfo().getReplicaId() != 0; return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale); } void prepareGet(final Get get) throws IOException { checkRow(get.getRow(), "Get"); // Verify families are all valid if (get.hasFamilies()) { for (byte[] family : get.familySet()) { checkFamily(family); } } else { // Adding all families to scanner for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { get.addFamily(family); } } } @Override public List get(Get get, boolean withCoprocessor) throws IOException { return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE); } public List get(Get get, boolean withCoprocessor, long nonceGroup, long nonce) throws IOException { List results = new ArrayList<>(); long before = EnvironmentEdgeManager.currentTime(); // pre-get CP hook if (withCoprocessor && (coprocessorHost != null)) { if (coprocessorHost.preGet(get, results)) { metricsUpdateForGet(results, before); return results; } } Scan scan = new Scan(get); if (scan.getLoadColumnFamiliesOnDemandValue() == null) { scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault()); } RegionScanner scanner = null; try { scanner = getScanner(scan, null, nonceGroup, nonce); scanner.next(results); } finally { if (scanner != null) scanner.close(); } // post-get CP hook if (withCoprocessor && (coprocessorHost != null)) { coprocessorHost.postGet(get, results); } metricsUpdateForGet(results, before); return results; } void metricsUpdateForGet(List results, long before) { if (this.metricsRegion != null) { this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before); } if (rsServices != null && this.rsServices.getMetrics() != null) { rsServices.getMetrics().updateReadQueryMeter(getTableDescriptor().getTableName(), 1); } } @Override public void mutateRow(RowMutations rm) throws IOException { // Don't need nonces here - RowMutations only supports puts and deletes final List m = rm.getMutations(); batchMutate(m.toArray(new Mutation[m.size()]), true, HConstants.NO_NONCE, HConstants.NO_NONCE); } /** * Perform atomic (all or none) mutations within the region. * @param mutations The list of mutations to perform. * mutations can contain operations for multiple rows. * Caller has to ensure that all rows are contained in this region. * @param rowsToLock Rows to lock * @param nonceGroup Optional nonce group of the operation (client Id) * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence") * If multiple rows are locked care should be taken that * rowsToLock is sorted in order to avoid deadlocks. * @throws IOException */ @Override public void mutateRowsWithLocks(Collection mutations, Collection rowsToLock, long nonceGroup, long nonce) throws IOException { batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]), true, nonceGroup, nonce) { @Override public MiniBatchOperationInProgress lockRowsAndBuildMiniBatch( List acquiredRowLocks) throws IOException { RowLock prevRowLock = null; for (byte[] row : rowsToLock) { try { RowLock rowLock = region.getRowLockInternal(row, false, prevRowLock); // write lock if (rowLock != prevRowLock) { acquiredRowLocks.add(rowLock); prevRowLock = rowLock; } } catch (IOException ioe) { LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this, ioe); throw ioe; } } return createMiniBatch(size(), size()); } }); } /** * @return statistics about the current load of the region */ public ClientProtos.RegionLoadStats getLoadStatistics() { if (!regionStatsEnabled) { return null; } ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder(); stats.setMemStoreLoad((int) (Math.min(100, (this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize))); if (rsServices.getHeapMemoryManager() != null) { // the HeapMemoryManager uses -0.0 to signal a problem asking the JVM, // so we could just do the calculation below and we'll get a 0. // treating it as a special case analogous to no HMM instead so that it can be // programatically treated different from using <1% of heap. final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent(); if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) { stats.setHeapOccupancy((int)(occupancy * 100)); } } stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 ? 100 : rsServices.getCompactionPressure() * 100)); return stats.build(); } @Override public void processRowsWithLocks(RowProcessor processor) throws IOException { processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE); } @Override public void processRowsWithLocks(RowProcessor processor, long nonceGroup, long nonce) throws IOException { processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce); } @Override public void processRowsWithLocks(RowProcessor processor, long timeout, long nonceGroup, long nonce) throws IOException { for (byte[] row : processor.getRowsToLock()) { checkRow(row, "processRowsWithLocks"); } if (!processor.readOnly()) { checkReadOnly(); } checkResources(); startRegionOperation(); WALEdit walEdit = new WALEdit(); // STEP 1. Run pre-process hook preProcess(processor, walEdit); // Short circuit the read only case if (processor.readOnly()) { try { long now = EnvironmentEdgeManager.currentTime(); doProcessRowWithTimeout(processor, now, this, null, null, timeout); processor.postProcess(this, walEdit, true); } finally { closeRegionOperation(); } return; } boolean locked = false; List acquiredRowLocks = null; List mutations = new ArrayList<>(); Collection rowsToLock = processor.getRowsToLock(); // This is assigned by mvcc either explicity in the below or in the guts of the WAL append // when it assigns the edit a sequencedid (A.K.A the mvcc write number). WriteEntry writeEntry = null; MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing(); try { boolean success = false; try { // STEP 2. Acquire the row lock(s) acquiredRowLocks = new ArrayList<>(rowsToLock.size()); RowLock prevRowLock = null; for (byte[] row : rowsToLock) { // Attempt to lock all involved rows, throw if any lock times out // use a writer lock for mixed reads and writes RowLock rowLock = getRowLockInternal(row, false, prevRowLock); if (rowLock != prevRowLock) { acquiredRowLocks.add(rowLock); prevRowLock = rowLock; } } // STEP 3. Region lock lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 1 : acquiredRowLocks.size()); locked = true; long now = EnvironmentEdgeManager.currentTime(); // STEP 4. Let the processor scan the rows, generate mutations and add waledits doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout); if (!mutations.isEmpty()) { writeRequestsCount.add(mutations.size()); // STEP 5. Call the preBatchMutate hook processor.preBatchMutate(this, walEdit); // STEP 6. Append and sync if walEdit has data to write out. if (!walEdit.isEmpty()) { writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()), processor.getClusterIds(), now, nonceGroup, nonce); } else { // We are here if WAL is being skipped. writeEntry = this.mvcc.begin(); } // STEP 7. Apply to memstore long sequenceId = writeEntry.getWriteNumber(); for (Mutation m : mutations) { // Handle any tag based cell features. // TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before // so tags go into WAL? rewriteCellTags(m.getFamilyCellMap(), m); for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) { Cell cell = cellScanner.current(); if (walEdit.isEmpty()) { // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id. // If no WAL, need to stamp it here. PrivateCellUtil.setSequenceId(cell, sequenceId); } applyToMemStore(getStore(cell), cell, memstoreAccounting); } } // STEP 8. call postBatchMutate hook processor.postBatchMutate(this); // STEP 9. Complete mvcc. mvcc.completeAndWait(writeEntry); writeEntry = null; // STEP 10. Release region lock if (locked) { this.updatesLock.readLock().unlock(); locked = false; } // STEP 11. Release row lock(s) releaseRowLocks(acquiredRowLocks); if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor. getTableName(), mutations.size()); } } success = true; } finally { // Call complete rather than completeAndWait because we probably had error if walKey != null if (writeEntry != null) mvcc.complete(writeEntry); if (locked) { this.updatesLock.readLock().unlock(); } // release locks if some were acquired but another timed out releaseRowLocks(acquiredRowLocks); } // 12. Run post-process hook processor.postProcess(this, walEdit, success); } finally { closeRegionOperation(); if (!mutations.isEmpty()) { this.incMemStoreSize(memstoreAccounting.getMemStoreSize()); requestFlushIfNeeded(); } } } private void preProcess(final RowProcessor processor, final WALEdit walEdit) throws IOException { try { processor.preProcess(this, walEdit); } catch (IOException e) { closeRegionOperation(); throw e; } } private void doProcessRowWithTimeout(final RowProcessor processor, final long now, final HRegion region, final List mutations, final WALEdit walEdit, final long timeout) throws IOException { // Short circuit the no time bound case. if (timeout < 0) { try { processor.process(now, region, mutations, walEdit); } catch (IOException e) { String row = processor.getRowsToLock().isEmpty() ? "" : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); throw e; } return; } // Case with time bound FutureTask task = new FutureTask<>(new Callable() { @Override public Void call() throws IOException { try { processor.process(now, region, mutations, walEdit); return null; } catch (IOException e) { String row = processor.getRowsToLock().isEmpty() ? "" : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); throw e; } } }); rowProcessorExecutor.execute(task); try { task.get(timeout, TimeUnit.MILLISECONDS); } catch (TimeoutException te) { String row = processor.getRowsToLock().isEmpty() ? "" : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; LOG.error("RowProcessor timeout: {} ms, in region {}, {}", timeout, getRegionInfo().getRegionNameAsString(), row); throw new IOException(te); } catch (Exception e) { throw new IOException(e); } } @Override public Result append(Append append) throws IOException { return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); } public Result append(Append mutation, long nonceGroup, long nonce) throws IOException { return doDelta(Operation.APPEND, mutation, nonceGroup, nonce, mutation.isReturnResults()); } @Override public Result increment(Increment increment) throws IOException { return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); } public Result increment(Increment mutation, long nonceGroup, long nonce) throws IOException { return doDelta(Operation.INCREMENT, mutation, nonceGroup, nonce, mutation.isReturnResults()); } /** * Add "deltas" to Cells. Deltas are increments or appends. Switch on op. * *

If increment, add deltas to current values or if an append, then * append the deltas to the current Cell values. * *

Append and Increment code paths are mostly the same. They differ in just a few places. * This method does the code path for increment and append and then in key spots, switches * on the passed in op to do increment or append specific paths. */ private Result doDelta(Operation op, Mutation mutation, long nonceGroup, long nonce, boolean returnResults) throws IOException { checkReadOnly(); checkResources(); checkRow(mutation.getRow(), op.toString()); checkFamilies(mutation.getFamilyCellMap().keySet()); this.writeRequestsCount.increment(); WriteEntry writeEntry = null; startRegionOperation(op); List results = returnResults? new ArrayList<>(mutation.size()): null; RowLock rowLock = null; MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing(); try { rowLock = getRowLockInternal(mutation.getRow(), false, null); lock(this.updatesLock.readLock()); try { Result cpResult = doCoprocessorPreCall(op, mutation); if (cpResult != null) { // Metrics updated below in the finally block. return returnResults? cpResult: null; } Durability effectiveDurability = getEffectiveDurability(mutation.getDurability()); Map> forMemStore = new HashMap<>(mutation.getFamilyCellMap().size()); // Reckon Cells to apply to WAL -- in returned walEdit -- and what to add to memstore and // what to return back to the client (in 'forMemStore' and 'results' respectively). WALEdit walEdit = reckonDeltas(op, mutation, effectiveDurability, forMemStore, results); // Actually write to WAL now if a walEdit to apply. if (walEdit != null && !walEdit.isEmpty()) { writeEntry = doWALAppend(walEdit, effectiveDurability, nonceGroup, nonce); } else { // If walEdits is empty, it means we skipped the WAL; update LongAdders and start an mvcc // transaction. recordMutationWithoutWal(mutation.getFamilyCellMap()); writeEntry = mvcc.begin(); updateSequenceId(forMemStore.values(), writeEntry.getWriteNumber()); } // Now write to MemStore. Do it a column family at a time. for (Map.Entry> e : forMemStore.entrySet()) { applyToMemStore(e.getKey(), e.getValue(), true, memstoreAccounting); } mvcc.completeAndWait(writeEntry); if (rsServices != null && rsServices.getNonceManager() != null) { rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce, writeEntry.getWriteNumber()); } if (rsServices != null && rsServices.getMetrics() != null) { rsServices.getMetrics().updateWriteQueryMeter(this.htableDescriptor. getTableName()); } writeEntry = null; } finally { this.updatesLock.readLock().unlock(); } // If results is null, then client asked that we not return the calculated results. return results != null && returnResults? Result.create(results): Result.EMPTY_RESULT; } finally { // Call complete always, even on success. doDelta is doing a Get READ_UNCOMMITTED when it goes // to get current value under an exclusive lock so no need so no need to wait to return to // the client. Means only way to read-your-own-increment or append is to come in with an // a 0 increment. if (writeEntry != null) mvcc.complete(writeEntry); if (rowLock != null) { rowLock.release(); } // Request a cache flush if over the limit. Do it outside update lock. incMemStoreSize(memstoreAccounting.getMemStoreSize()); requestFlushIfNeeded(); closeRegionOperation(op); if (this.metricsRegion != null) { switch (op) { case INCREMENT: this.metricsRegion.updateIncrement(); break; case APPEND: this.metricsRegion.updateAppend(); break; default: break; } } } } private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, long nonceGroup, long nonce) throws IOException { return doWALAppend(walEdit, durability, WALKey.EMPTY_UUIDS, System.currentTimeMillis(), nonceGroup, nonce); } private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List clusterIds, long now, long nonceGroup, long nonce) throws IOException { return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce, SequenceId.NO_SEQUENCE_ID); } /** * @return writeEntry associated with this append */ private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List clusterIds, long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException { Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(), "WALEdit is null or empty!"); Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID, "Invalid replay sequence Id for replay WALEdit!"); // Using default cluster id, as this can only happen in the originating cluster. // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey // here instead of WALKeyImpl directly to support legacy coprocessors. WALKeyImpl walKey = walEdit.isReplay()? new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, nonceGroup, nonce, mvcc) : new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, nonceGroup, nonce, mvcc, this.getReplicationScope()); if (walEdit.isReplay()) { walKey.setOrigLogSeqNum(origLogSeqNum); } //don't call the coproc hook for writes to the WAL caused by //system lifecycle events like flushes or compactions if (this.coprocessorHost != null && !walEdit.isMetaEdit()) { this.coprocessorHost.preWALAppend(walKey, walEdit); } WriteEntry writeEntry = null; try { long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit); // Call sync on our edit. if (txid != 0) { sync(txid, durability); } writeEntry = walKey.getWriteEntry(); } catch (IOException ioe) { if (walKey != null && walKey.getWriteEntry() != null) { mvcc.complete(walKey.getWriteEntry()); } throw ioe; } return writeEntry; } /** * Do coprocessor pre-increment or pre-append call. * @return Result returned out of the coprocessor, which means bypass all further processing and * return the proffered Result instead, or null which means proceed. */ private Result doCoprocessorPreCall(final Operation op, final Mutation mutation) throws IOException { Result result = null; if (this.coprocessorHost != null) { switch(op) { case INCREMENT: result = this.coprocessorHost.preIncrementAfterRowLock((Increment)mutation); break; case APPEND: result = this.coprocessorHost.preAppendAfterRowLock((Append)mutation); break; default: throw new UnsupportedOperationException(op.toString()); } } return result; } /** * Reckon the Cells to apply to WAL, memstore, and to return to the Client; these Sets are not * always the same dependent on whether to write WAL. * * @param results Fill in here what goes back to the Client if it is non-null (if null, client * doesn't want results). * @param forMemStore Fill in here what to apply to the MemStore (by Store). * @return A WALEdit to apply to WAL or null if we are to skip the WAL. */ private WALEdit reckonDeltas(Operation op, Mutation mutation, Durability effectiveDurability, Map> forMemStore, List results) throws IOException { WALEdit walEdit = null; long now = EnvironmentEdgeManager.currentTime(); final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL; // Process a Store/family at a time. for (Map.Entry> entry: mutation.getFamilyCellMap().entrySet()) { final byte[] columnFamilyName = entry.getKey(); List deltas = entry.getValue(); // Reckon for the Store what to apply to WAL and MemStore. List toApply = reckonDeltasByStore(stores.get(columnFamilyName), op, mutation, effectiveDurability, now, deltas, results); if (!toApply.isEmpty()) { for (Cell cell : toApply) { HStore store = getStore(cell); if (store == null) { checkFamily(CellUtil.cloneFamily(cell)); } else { forMemStore.computeIfAbsent(store, key -> new ArrayList<>()).add(cell); } } if (writeToWAL) { if (walEdit == null) { walEdit = new WALEdit(); } walEdit.getCells().addAll(toApply); } } } return walEdit; } /** * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed * column family/Store. * * Does Get of current value and then adds passed in deltas for this Store returning the result. * * @param op Whether Increment or Append * @param mutation The encompassing Mutation object * @param deltas Changes to apply to this Store; either increment amount or data to append * @param results In here we accumulate all the Cells we are to return to the client. If null, * client doesn't want results returned. * @return Resulting Cells after deltas have been applied to current * values. Side effect is our filling out of the results List. */ private List reckonDeltasByStore(HStore store, Operation op, Mutation mutation, Durability effectiveDurability, long now, List deltas, List results) throws IOException { byte[] columnFamily = store.getColumnFamilyDescriptor().getName(); List> cellPairs = new ArrayList<>(deltas.size()); // Get previous values for all columns in this family. TimeRange tr = null; switch (op) { case INCREMENT: tr = ((Increment)mutation).getTimeRange(); break; case APPEND: tr = ((Append)mutation).getTimeRange(); break; default: break; } List currentValues = get(mutation, store, deltas,null, tr); // Iterate the input columns and update existing values if they were found, otherwise // add new column initialized to the delta amount int currentValuesIndex = 0; for (int i = 0; i < deltas.size(); i++) { Cell delta = deltas.get(i); Cell currentValue = null; if (currentValuesIndex < currentValues.size() && CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)) { currentValue = currentValues.get(currentValuesIndex); if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) { currentValuesIndex++; } } // Switch on whether this an increment or an append building the new Cell to apply. Cell newCell = null; switch (op) { case INCREMENT: long deltaAmount = getLongValue(delta); final long newValue = currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount; newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) -> Bytes.toBytes(newValue)); break; case APPEND: newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) -> ByteBuffer.wrap(new byte[delta.getValueLength() + oldCell.getValueLength()]) .put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength()) .put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength()) .array() ); break; default: throw new UnsupportedOperationException(op.toString()); } if (this.maxCellSize > 0) { int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell); if (newCellSize > this.maxCellSize) { String msg = "Cell with size " + newCellSize + " exceeds limit of " + this.maxCellSize + " bytes in region " + this; LOG.debug(msg); throw new DoNotRetryIOException(msg); } } cellPairs.add(new Pair<>(currentValue, newCell)); // Add to results to get returned to the Client. If null, cilent does not want results. if (results != null) { results.add(newCell); } } // Give coprocessors a chance to update the new cells before apply to WAL or memstore if (coprocessorHost != null) { // Here the operation must be increment or append. cellPairs = op == Operation.INCREMENT ? coprocessorHost.postIncrementBeforeWAL(mutation, cellPairs) : coprocessorHost.postAppendBeforeWAL(mutation, cellPairs); } return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList()); } private static Cell reckonDelta(final Cell delta, final Cell currentCell, final byte[] columnFamily, final long now, Mutation mutation, Function supplier) throws IOException { // Forward any tags found on the delta. List tags = TagUtil.carryForwardTags(delta); tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); if (currentCell != null) { tags = TagUtil.carryForwardTags(tags, currentCell); byte[] newValue = supplier.apply(currentCell); return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) .setRow(mutation.getRow(), 0, mutation.getRow().length) .setFamily(columnFamily, 0, columnFamily.length) // copy the qualifier if the cell is located in shared memory. .setQualifier(CellUtil.cloneQualifier(delta)) .setTimestamp(Math.max(currentCell.getTimestamp() + 1, now)) .setType(KeyValue.Type.Put.getCode()) .setValue(newValue, 0, newValue.length) .setTags(TagUtil.fromList(tags)) .build(); } else { PrivateCellUtil.updateLatestStamp(delta, now); return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags); } } /** * @return Get the long out of the passed in Cell */ private static long getLongValue(final Cell cell) throws DoNotRetryIOException { int len = cell.getValueLength(); if (len != Bytes.SIZEOF_LONG) { // throw DoNotRetryIOException instead of IllegalArgumentException throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide"); } return PrivateCellUtil.getValueAsLong(cell); } /** * Do a specific Get on passed columnFamily and column qualifiers. * @param mutation Mutation we are doing this Get for. * @param store Which column family on row (TODO: Go all Gets in one go) * @param coordinates Cells from mutation used as coordinates applied to Get. * @return Return list of Cells found. */ private List get(Mutation mutation, HStore store, List coordinates, IsolationLevel isolation, TimeRange tr) throws IOException { // Sort the cells so that they match the order that they appear in the Get results. Otherwise, // we won't be able to find the existing values if the cells are not specified in order by the // client since cells are in an array list. // TODO: I don't get why we are sorting. St.Ack 20150107 sort(coordinates, store.getComparator()); Get get = new Get(mutation.getRow()); if (isolation != null) { get.setIsolationLevel(isolation); } for (Cell cell: coordinates) { get.addColumn(store.getColumnFamilyDescriptor().getName(), CellUtil.cloneQualifier(cell)); } // Increments carry time range. If an Increment instance, put it on the Get. if (tr != null) { get.setTimeRange(tr.getMin(), tr.getMax()); } return get(get, false); } /** * @return Sorted list of cells using comparator */ private static List sort(List cells, final CellComparator comparator) { cells.sort(comparator); return cells; } // // New HBASE-880 Helpers // void checkFamily(final byte [] family) throws NoSuchColumnFamilyException { if (!this.htableDescriptor.hasColumnFamily(family)) { throw new NoSuchColumnFamilyException("Column family " + Bytes.toString(family) + " does not exist in region " + this + " in table " + this.htableDescriptor); } } public static final long FIXED_OVERHEAD = ClassSize.align( ClassSize.OBJECT + ClassSize.ARRAY + 55 * ClassSize.REFERENCE + 3 * Bytes.SIZEOF_INT + (15 * Bytes.SIZEOF_LONG) + 3 * Bytes.SIZEOF_BOOLEAN); // woefully out of date - currently missing: // 1 x HashMap - coprocessorServiceHandlers // 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL, // checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount, // writeRequestsCount // 1 x HRegion$WriteState - writestate // 1 x RegionCoprocessorHost - coprocessorHost // 1 x RegionSplitPolicy - splitPolicy // 1 x MetricsRegion - metricsRegion // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper public static final long DEEP_OVERHEAD = FIXED_OVERHEAD + ClassSize.OBJECT + // closeLock (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing (3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL, // compactionsFailed (2 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints WriteState.HEAP_SIZE + // writestate ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock MultiVersionConcurrencyControl.FIXED_SIZE // mvcc + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress + ClassSize.STORE_SERVICES // store services + StoreHotnessProtector.FIXED_SIZE ; @Override public long heapSize() { // this does not take into account row locks, recent flushes, mvcc entries, and more return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum(); } /** * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to * be available for handling Region#execService(com.google.protobuf.RpcController, * org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls. * *

* Only a single instance may be registered per region for a given {@link Service} subclass (the * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}. * After the first registration, subsequent calls with the same service name will fail with * a return value of {@code false}. *

* @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint * @return {@code true} if the registration was successful, {@code false} * otherwise */ public boolean registerService(com.google.protobuf.Service instance) { /* * No stacking of instances is allowed for a single service name */ com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType(); String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc); if (coprocessorServiceHandlers.containsKey(serviceName)) { LOG.error("Coprocessor service {} already registered, rejecting request from {} in region {}", serviceName, instance, this); return false; } coprocessorServiceHandlers.put(serviceName, instance); if (LOG.isDebugEnabled()) { LOG.debug("Registered coprocessor service: region=" + Bytes.toStringBinary(getRegionInfo().getRegionName()) + " service=" + serviceName); } return true; } /** * Executes a single protocol buffer coprocessor endpoint {@link Service} method using * the registered protocol handlers. {@link Service} implementations must be registered via the * {@link #registerService(com.google.protobuf.Service)} * method before they are available. * * @param controller an {@code RpcContoller} implementation to pass to the invoked service * @param call a {@code CoprocessorServiceCall} instance identifying the service, method, * and parameters for the method invocation * @return a protocol buffer {@code Message} instance containing the method's result * @throws IOException if no registered service handler is found or an error * occurs during the invocation * @see #registerService(com.google.protobuf.Service) */ public com.google.protobuf.Message execService(com.google.protobuf.RpcController controller, CoprocessorServiceCall call) throws IOException { String serviceName = call.getServiceName(); com.google.protobuf.Service service = coprocessorServiceHandlers.get(serviceName); if (service == null) { throw new UnknownProtocolException(null, "No registered coprocessor service found for " + serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName())); } com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType(); String methodName = call.getMethodName(); com.google.protobuf.Descriptors.MethodDescriptor methodDesc = CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc); com.google.protobuf.Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType(); org.apache.hadoop.hbase.protobuf.ProtobufUtil.mergeFrom(builder, call.getRequest().toByteArray()); com.google.protobuf.Message request = CoprocessorRpcUtils.getRequest(service, methodDesc, call.getRequest()); if (coprocessorHost != null) { request = coprocessorHost.preEndpointInvocation(service, methodName, request); } final com.google.protobuf.Message.Builder responseBuilder = service.getResponsePrototype(methodDesc).newBuilderForType(); service.callMethod(methodDesc, controller, request, new com.google.protobuf.RpcCallback() { @Override public void run(com.google.protobuf.Message message) { if (message != null) { responseBuilder.mergeFrom(message); } } }); if (coprocessorHost != null) { coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder); } IOException exception = org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller); if (exception != null) { throw exception; } return responseBuilder.build(); } boolean shouldForceSplit() { return this.splitRequest; } byte[] getExplicitSplitPoint() { return this.explicitSplitPoint; } void forceSplit(byte[] sp) { // This HRegion will go away after the forced split is successful // But if a forced split fails, we need to clear forced split. this.splitRequest = true; if (sp != null) { this.explicitSplitPoint = sp; } } void clearSplit() { this.splitRequest = false; this.explicitSplitPoint = null; } /** * Return the splitpoint. null indicates the region isn't splittable * If the splitpoint isn't explicitly specified, it will go over the stores * to find the best splitpoint. Currently the criteria of best splitpoint * is based on the size of the store. */ public byte[] checkSplit() { // Can't split META if (this.getRegionInfo().isMetaRegion() || TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) { if (shouldForceSplit()) { LOG.warn("Cannot split meta region in HBase 0.20 and above"); } return null; } // Can't split a region that is closing. if (this.isClosing()) { return null; } if (!splitPolicy.shouldSplit()) { return null; } byte[] ret = splitPolicy.getSplitPoint(); if (ret != null) { try { checkRow(ret, "calculated split"); } catch (IOException e) { LOG.error("Ignoring invalid split for region {}", this, e); return null; } } return ret; } /** * @return The priority that this region should have in the compaction queue */ public int getCompactPriority() { return stores.values().stream().mapToInt(HStore::getCompactPriority).min() .orElse(Store.NO_PRIORITY); } /** @return the coprocessor host */ public RegionCoprocessorHost getCoprocessorHost() { return coprocessorHost; } /** @param coprocessorHost the new coprocessor host */ @VisibleForTesting public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) { this.coprocessorHost = coprocessorHost; } @Override public void startRegionOperation() throws IOException { startRegionOperation(Operation.ANY); } @Override public void startRegionOperation(Operation op) throws IOException { switch (op) { case GET: // read operations case SCAN: checkReadsEnabled(); break; default: break; } if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION || op == Operation.COMPACT_REGION || op == Operation.COMPACT_SWITCH) { // split, merge or compact region doesn't need to check the closing/closed state or lock the // region return; } if (this.closing.get()) { throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); } lock(lock.readLock()); if (this.closed.get()) { lock.readLock().unlock(); throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); } // The unit for snapshot is a region. So, all stores for this region must be // prepared for snapshot operation before proceeding. if (op == Operation.SNAPSHOT) { stores.values().forEach(HStore::preSnapshotOperation); } try { if (coprocessorHost != null) { coprocessorHost.postStartRegionOperation(op); } } catch (Exception e) { lock.readLock().unlock(); throw new IOException(e); } } @Override public void closeRegionOperation() throws IOException { closeRegionOperation(Operation.ANY); } @Override public void closeRegionOperation(Operation operation) throws IOException { if (operation == Operation.SNAPSHOT) { stores.values().forEach(HStore::postSnapshotOperation); } lock.readLock().unlock(); if (coprocessorHost != null) { coprocessorHost.postCloseRegionOperation(operation); } } /** * This method needs to be called before any public call that reads or * modifies stores in bulk. It has to be called just before a try. * #closeBulkRegionOperation needs to be called in the try's finally block * Acquires a writelock and checks if the region is closing or closed. * @throws NotServingRegionException when the region is closing or closed * @throws RegionTooBusyException if failed to get the lock in time * @throws InterruptedIOException if interrupted while waiting for a lock */ private void startBulkRegionOperation(boolean writeLockNeeded) throws NotServingRegionException, RegionTooBusyException, InterruptedIOException { if (this.closing.get()) { throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); } if (writeLockNeeded) lock(lock.writeLock()); else lock(lock.readLock()); if (this.closed.get()) { if (writeLockNeeded) lock.writeLock().unlock(); else lock.readLock().unlock(); throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); } } /** * Closes the lock. This needs to be called in the finally block corresponding * to the try block of #startRegionOperation */ private void closeBulkRegionOperation(){ if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock(); else lock.readLock().unlock(); } /** * Update LongAdders for number of puts without wal and the size of possible data loss. * These information are exposed by the region server metrics. */ private void recordMutationWithoutWal(final Map> familyMap) { numMutationsWithoutWAL.increment(); if (numMutationsWithoutWAL.sum() <= 1) { LOG.info("writing data to region " + this + " with WAL disabled. Data may be lost in the event of a crash."); } long mutationSize = 0; for (List cells: familyMap.values()) { // Optimization: 'foreach' loop is not used. See: // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i=0; i < listSize; i++) { Cell cell = cells.get(i); mutationSize += cell.getSerializedSize(); } } dataInMemoryWithoutWAL.add(mutationSize); } private void lock(final Lock lock) throws RegionTooBusyException, InterruptedIOException { lock(lock, 1); } /** * Try to acquire a lock. Throw RegionTooBusyException * if failed to get the lock in time. Throw InterruptedIOException * if interrupted while waiting for the lock. */ private void lock(final Lock lock, final int multiplier) throws RegionTooBusyException, InterruptedIOException { try { final long waitTime = Math.min(maxBusyWaitDuration, busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier)); if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) { // Don't print millis. Message is used as a key over in // RetriesExhaustedWithDetailsException processing. final String regionName = this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getRegionNameAsString(); final String serverName = this.getRegionServerServices() == null ? "unknown" : (this.getRegionServerServices().getServerName() == null ? "unknown" : this.getRegionServerServices().getServerName().toString()); RegionTooBusyException rtbe = new RegionTooBusyException( "Failed to obtain lock; regionName=" + regionName + ", server=" + serverName); LOG.warn("Region is too busy to allow lock acquisition.", rtbe); throw rtbe; } } catch (InterruptedException ie) { LOG.info("Interrupted while waiting for a lock in region {}", this); InterruptedIOException iie = new InterruptedIOException(); iie.initCause(ie); throw iie; } } /** * Calls sync with the given transaction ID * @param txid should sync up to which transaction * @throws IOException If anything goes wrong with DFS */ private void sync(long txid, Durability durability) throws IOException { if (this.getRegionInfo().isMetaRegion()) { this.wal.sync(txid); } else { switch(durability) { case USE_DEFAULT: // do what table defaults to if (shouldSyncWAL()) { this.wal.sync(txid); } break; case SKIP_WAL: // nothing do to break; case ASYNC_WAL: // nothing do to break; case SYNC_WAL: this.wal.sync(txid, false); break; case FSYNC_WAL: this.wal.sync(txid, true); break; default: throw new RuntimeException("Unknown durability " + durability); } } } /** * Check whether we should sync the wal from the table's durability settings */ private boolean shouldSyncWAL() { return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal(); } /** * A mocked list implementation - discards all updates. */ private static final List MOCKED_LIST = new AbstractList() { @Override public void add(int index, Cell element) { // do nothing } @Override public boolean addAll(int index, Collection c) { return false; // this list is never changed as a result of an update } @Override public KeyValue get(int index) { throw new UnsupportedOperationException(); } @Override public int size() { return 0; } }; /** @return the latest sequence number that was read from storage when this region was opened */ public long getOpenSeqNum() { return this.openSeqNum; } @Override public Map getMaxStoreSeqId() { return this.maxSeqIdInStores; } public long getOldestSeqIdOfStore(byte[] familyName) { return wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName); } @Override public CompactionState getCompactionState() { boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0; return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR) : (hasMinor ? CompactionState.MINOR : CompactionState.NONE)); } public void reportCompactionRequestStart(boolean isMajor){ (isMajor ? majorInProgress : minorInProgress).incrementAndGet(); } public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) { int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet(); // metrics compactionsFinished.increment(); compactionNumFilesCompacted.add(numFiles); compactionNumBytesCompacted.add(filesSizeCompacted); assert newValue >= 0; } public void reportCompactionRequestFailure() { compactionsFailed.increment(); } public void incrementCompactionsQueuedCount() { compactionsQueued.increment(); } public void decrementCompactionsQueuedCount() { compactionsQueued.decrement(); } public void incrementFlushesQueuedCount() { flushesQueued.increment(); } @VisibleForTesting public long getReadPoint() { return getReadPoint(IsolationLevel.READ_COMMITTED); } /** * {@inheritDoc} */ @Override public void onConfigurationChange(Configuration conf) { this.storeHotnessProtector.update(conf); } /** * {@inheritDoc} */ @Override public void registerChildren(ConfigurationManager manager) { configurationManager = manager; stores.values().forEach(manager::registerObserver); } /** * {@inheritDoc} */ @Override public void deregisterChildren(ConfigurationManager manager) { stores.values().forEach(configurationManager::deregisterObserver); } @Override public CellComparator getCellComparator() { return cellComparator; } public long getMemStoreFlushSize() { return this.memstoreFlushSize; } //// method for debugging tests void throwException(String title, String regionName) { StringBuilder buf = new StringBuilder(); buf.append(title + ", "); buf.append(getRegionInfo().toString()); buf.append(getRegionInfo().isMetaRegion() ? " meta region " : " "); buf.append("stores: "); for (HStore s : stores.values()) { buf.append(s.getColumnFamilyDescriptor().getNameAsString()); buf.append(" size: "); buf.append(s.getMemStoreSize().getDataSize()); buf.append(" "); } buf.append("end-of-stores"); buf.append(", memstore size "); buf.append(getMemStoreDataSize()); if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) { throw new RuntimeException(buf.toString()); } } @Override public void requestCompaction(String why, int priority, boolean major, CompactionLifeCycleTracker tracker) throws IOException { if (major) { stores.values().forEach(HStore::triggerMajorCompaction); } rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker, RpcServer.getRequestUser().orElse(null)); } @Override public void requestCompaction(byte[] family, String why, int priority, boolean major, CompactionLifeCycleTracker tracker) throws IOException { HStore store = stores.get(family); if (store == null) { throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family) + " does not exist in region " + getRegionInfo().getRegionNameAsString()); } if (major) { store.triggerMajorCompaction(); } rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker, RpcServer.getRequestUser().orElse(null)); } private void requestFlushIfNeeded() throws RegionTooBusyException { if(isFlushSize(this.memStoreSizing.getMemStoreSize())) { requestFlush(); } } private void requestFlush() { if (this.rsServices == null) { return; } requestFlush0(FlushLifeCycleTracker.DUMMY); } private void requestFlush0(FlushLifeCycleTracker tracker) { boolean shouldFlush = false; synchronized (writestate) { if (!this.writestate.isFlushRequested()) { shouldFlush = true; writestate.flushRequested = true; } } if (shouldFlush) { // Make request outside of synchronize block; HBASE-818. this.rsServices.getFlushRequester().requestFlush(this, false, tracker); if (LOG.isDebugEnabled()) { LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName()); } } else { tracker.notExecuted("Flush already requested on " + this); } } @Override public void requestFlush(FlushLifeCycleTracker tracker) throws IOException { requestFlush0(tracker); } /** * This method modifies the region's configuration in order to inject replication-related * features * @param conf region configurations */ static void decorateRegionConfiguration(Configuration conf) { if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) { String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,""); String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName(); if (!plugins.contains(replicationCoprocessorClass)) { conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, (plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass); } } } @VisibleForTesting public void setReadRequestsCount(long readRequestsCount) { this.readRequestsCount.add(readRequestsCount); } @VisibleForTesting public void setWriteRequestsCount(long writeRequestsCount) { this.writeRequestsCount.add(writeRequestsCount); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy