// org.apache.hadoop.hbase.regionserver.wal.SequenceIdAccounting (from the hbase-server artifact)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver.wal;
import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.stream.Collectors;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ImmutableByteArray;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Accounting of sequence ids per region and then by column family. So we can keep our accounting
* current, call startCacheFlush and then finishedCacheFlush or abortCacheFlush so this instance can
* keep abreast of the state of sequence id persistence. Also call update per append.
*
* For the implementation, we assume that all the {@code encodedRegionName} passed in are gotten by
* {@link org.apache.hadoop.hbase.client.RegionInfo#getEncodedNameAsBytes()}. So it is safe to use
* it as a hash key. And for family name, we use {@link ImmutableByteArray} as key. This is because
* hash based map is much faster than RBTree or CSLM and here we are on the critical write path. See
* HBASE-16278 for more details.
*
*/
@InterfaceAudience.Private
class SequenceIdAccounting {
/** Logger used for the flush-accounting warnings and errors emitted by this class. */
private static final Logger LOG = LoggerFactory.getLogger(SequenceIdAccounting.class);
/**
 * This lock ties all operations on {@link SequenceIdAccounting#flushingSequenceIds} and
 * {@link #lowestUnflushedSequenceIds} Maps. {@link #lowestUnflushedSequenceIds} has the lowest
 * outstanding sequence ids EXCEPT when flushing. When we flush, the current lowest set for the
 * region/column family are moved (atomically because of this lock) to
 * {@link #flushingSequenceIds}.
 * <p>
 * The two Maps are tied by this locking object EXCEPT when we go to update the lowest entry; see
 * {@link #lowestUnflushedSequenceIds}. That update is a putIfAbsent call on
 * {@link #lowestUnflushedSequenceIds}: we add the new lowest sequence id only if there is no
 * entry for the current column family. There will be no entry only if we just came up OR we have
 * moved aside the current set of lowest sequence ids because they are being flushed (by putting
 * them into {@link #flushingSequenceIds}). This is how we pick up the next 'lowest' sequence id
 * per region per column family, to be used when figuring out what is in the next flush.
 */
private final Object tieLock = new Object();
/**
* Map of encoded region names and family names to their OLDEST -- i.e. their first, the
* longest-lived, their 'earliest', the 'lowest' -- sequence id.
*
* When we flush, the current lowest sequence ids get cleared and added to
* {@link #flushingSequenceIds}. The next append that comes in, is then added here to
* {@link #lowestUnflushedSequenceIds} as the next lowest sequenceid.
*
* If flush fails, currently server is aborted so no need to restore previous sequence ids.
*
* Needs to be concurrent Maps because we use putIfAbsent updating oldest.
*/
private final ConcurrentMap> lowestUnflushedSequenceIds = new ConcurrentHashMap<>();
/**
* Map of encoded region names and family names to their lowest or OLDEST sequence/edit id
* currently being flushed out to hfiles. Entries are moved here from
* {@link #lowestUnflushedSequenceIds} while the lock {@link #tieLock} is held (so movement
* between the Maps is atomic).
*/
private final Map> flushingSequenceIds = new HashMap<>();
/**
 * Map of region encoded names to the latest/highest region sequence id. Updated on each call to
 * append.
 * <p>
 * This map uses byte[] as the key, and therefore relies on reference equality. It works in our
 * use case as we use {@link org.apache.hadoop.hbase.client.RegionInfo#getEncodedNameAsBytes()}
 * as keys. For a given region, it always returns the same array.
 * <p>
 * Not final: {@link #resetHighest()} swaps in a fresh map and hands the old one to the caller.
 */
private Map<byte[], Long> highestSequenceIds = new HashMap<>();
/**
* Returns the lowest unflushed sequence id for the region.
* @return Lowest outstanding unflushed sequenceid for encodedRegionName
. Will return
* {@link HConstants#NO_SEQNUM} when none.
*/
long getLowestSequenceId(final byte[] encodedRegionName) {
synchronized (this.tieLock) {
Map, Long> m = this.flushingSequenceIds.get(encodedRegionName);
long flushingLowest = m != null ? getLowestSequenceId(m) : Long.MAX_VALUE;
m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
long unflushedLowest = m != null ? getLowestSequenceId(m) : HConstants.NO_SEQNUM;
return Math.min(flushingLowest, unflushedLowest);
}
}
/**
 * Looks up the lowest sequence id recorded for a single column family of a region. The flushing
 * map is consulted first, then the unflushed map.
 * @param encodedRegionName encoded name of the region to look up
 * @param familyName        name of the column family to look up
 * @return Lowest outstanding unflushed sequenceid for {@code encodedRegionName} and
 *         {@code familyName}. Returned sequenceid may be for an edit currently being flushed.
 *         Returns {@link HConstants#NO_SEQNUM} when neither map has an entry.
 */
long getLowestSequenceId(final byte[] encodedRegionName, final byte[] familyName) {
  ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
  synchronized (this.tieLock) {
    // An id that is being flushed is older than anything appended since, so check it first.
    Map<ImmutableByteArray, Long> m = this.flushingSequenceIds.get(encodedRegionName);
    if (m != null) {
      Long lowest = m.get(familyNameWrapper);
      if (lowest != null) {
        return lowest;
      }
    }
    m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
    if (m != null) {
      Long lowest = m.get(familyNameWrapper);
      if (lowest != null) {
        return lowest;
      }
    }
  }
  return HConstants.NO_SEQNUM;
}
/**
 * Reset the accounting of highest sequenceid by regionname. A new empty map is installed and the
 * previous one is handed to the caller.
 * @return Return the previous accounting Map of regions to the last sequence id written into
 *         each.
 */
Map<byte[], Long> resetHighest() {
  Map<byte[], Long> old = this.highestSequenceIds;
  this.highestSequenceIds = new HashMap<>();
  return old;
}
/**
 * We've been passed a new sequenceid for the region. Set it as highest seen for this region and
 * if we are to record oldest, or lowest sequenceids, save it as oldest seen if nothing currently
 * older.
 * @param encodedRegionName encoded name of the region the edit belongs to
 * @param families          names of the column families touched by the edit
 * @param sequenceid        sequence id of the edit
 * @param lowest            Whether to keep running account of oldest sequence id.
 */
void update(byte[] encodedRegionName, Set<byte[]> families, long sequenceid,
  final boolean lowest) {
  Long l = Long.valueOf(sequenceid);
  this.highestSequenceIds.put(encodedRegionName, l);
  if (lowest) {
    ConcurrentMap<ImmutableByteArray, Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
    for (byte[] familyName : families) {
      // putIfAbsent: an existing entry is already the older (lower) id, so it must win.
      m.putIfAbsent(ImmutableByteArray.wrap(familyName), l);
    }
  }
}
/**
 * Clear all the records of the given region as it is going to be closed.
 * <p>
 * We will call this once we get the region close marker. We need this because, if we use
 * Durability.ASYNC_WAL, after calling startCacheFlush we may still get some ongoing wal entries
 * that have not been processed yet; this would lead to orphan records in the
 * lowestUnflushedSequenceIds and then cause too many WAL files.
 * <p>
 * See HBASE-23157 for more details.
 * @param encodedRegionName encoded name of the region being closed
 */
void onRegionClose(byte[] encodedRegionName) {
  synchronized (tieLock) {
    this.lowestUnflushedSequenceIds.remove(encodedRegionName);
    Map<ImmutableByteArray, Long> flushing = this.flushingSequenceIds.remove(encodedRegionName);
    if (flushing != null) {
      // A flush was still recorded as in progress for this region; log what is being dropped.
      LOG.warn("Still have flushing records when closing {}, {}",
        Bytes.toString(encodedRegionName),
        flushing.entrySet().stream().map(e -> e.getKey().toString() + "->" + e.getValue())
          .collect(Collectors.joining(",", "{", "}")));
    }
  }
  this.highestSequenceIds.remove(encodedRegionName);
}
/**
 * Update the store sequence id, e.g., upon executing in-memory compaction.
 * <p>
 * Also raises the region's highest sequence id when {@code sequenceId} exceeds the recorded one
 * (or none is recorded yet).
 * @param encodedRegionName encoded name of the region owning the store
 * @param familyName        name of the column family (store) to update
 * @param sequenceId        new lowest sequence id for the store; the call is a no-op when null
 * @param onlyIfGreater     when true, an existing entry is replaced only if {@code sequenceId}
 *                          is strictly greater than it; when false, it is always replaced
 */
void updateStore(byte[] encodedRegionName, byte[] familyName, Long sequenceId,
  boolean onlyIfGreater) {
  if (sequenceId == null) {
    return;
  }
  Long highest = this.highestSequenceIds.get(encodedRegionName);
  if (highest == null || sequenceId > highest) {
    this.highestSequenceIds.put(encodedRegionName, sequenceId);
  }
  ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
  synchronized (this.tieLock) {
    ConcurrentMap<ImmutableByteArray, Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
    boolean replaced = false;
    // Retry loop: update() writes to this map via putIfAbsent without holding tieLock, so the
    // compare-and-replace below can lose a race and must be re-attempted.
    while (!replaced) {
      Long oldSeqId = m.get(familyNameWrapper);
      if (oldSeqId == null) {
        m.put(familyNameWrapper, sequenceId);
        replaced = true;
      } else if (onlyIfGreater) {
        if (sequenceId > oldSeqId) {
          replaced = m.replace(familyNameWrapper, oldSeqId, sequenceId);
        } else {
          return;
        }
      } else { // replace even if sequence id is not greater than oldSeqId
        m.put(familyNameWrapper, sequenceId);
        return;
      }
    }
  }
}
/**
 * Returns the per-family map of lowest unflushed sequence ids for the region, creating and
 * registering an empty one if the region has no entry yet.
 * @param encodedRegionName encoded name of the region
 * @return the (possibly newly created) family-to-lowest-sequence-id map for the region
 */
ConcurrentMap<ImmutableByteArray, Long> getOrCreateLowestSequenceIds(byte[] encodedRegionName) {
  // Intentionally, this method does not take this.tieLock itself: it is invoked per append from
  // update(), which does not hold the lock.
  return computeIfAbsent(this.lowestUnflushedSequenceIds, encodedRegionName,
    ConcurrentHashMap::new);
}
/**
* @param sequenceids Map to search for lowest value.
* @return Lowest value found in sequenceids
.
*/
private static long getLowestSequenceId(Map, Long> sequenceids) {
long lowest = HConstants.NO_SEQNUM;
for (Map.Entry, Long> entry : sequenceids.entrySet()) {
if (entry.getKey().toString().equals("METAFAMILY")) {
continue;
}
Long sid = entry.getValue();
if (lowest == HConstants.NO_SEQNUM || sid.longValue() < lowest) {
lowest = sid.longValue();
}
}
return lowest;
}
/**
* n * @return New Map that has same keys as src
but instead of a Map for a value, it
* instead has found the smallest sequence id and it returns that as the value instead.
*/
private > Map flattenToLowestSequenceId(Map src) {
if (src == null || src.isEmpty()) {
return null;
}
Map tgt = new HashMap<>();
for (Map.Entry entry : src.entrySet()) {
long lowestSeqId = getLowestSequenceId(entry.getValue());
if (lowestSeqId != HConstants.NO_SEQNUM) {
tgt.put(entry.getKey(), lowestSeqId);
}
}
return tgt;
}
/**
 * Starts flush accounting for the given families, treating each one as being flushed completely
 * (every family is mapped to {@link HConstants#NO_SEQNUM} before delegating to the Map-based
 * overload).
 * @param encodedRegionName Region to flush.
 * @param families          Families to flush. May be a subset of all families in the region.
 * @return Returns {@link HConstants#NO_SEQNUM} if we are flushing the whole region OR if we are
 *         flushing a subset of all families but there are no edits in those families not being
 *         flushed; in other words, this is effectively same as a flush of all of the region
 *         though we were passed a subset of families. Otherwise, it returns the sequence id of
 *         the oldest/lowest outstanding edit.
 */
Long startCacheFlush(final byte[] encodedRegionName, final Set<byte[]> families) {
  Map<byte[], Long> familytoSeq = new HashMap<>();
  for (byte[] familyName : families) {
    familytoSeq.put(familyName, HConstants.NO_SEQNUM);
  }
  return startCacheFlush(encodedRegionName, familytoSeq);
}
/**
 * Starts flush accounting for a region: moves the lowest sequence ids of the listed families
 * from {@link #lowestUnflushedSequenceIds} into {@link #flushingSequenceIds} while holding
 * {@link #tieLock}.
 * @param encodedRegionName Region to flush.
 * @param familyToSeq       Families to flush, keyed by family name. A value of
 *                          {@link HConstants#NO_SEQNUM} removes the family's unflushed entry;
 *                          any other value replaces the family's existing unflushed entry with
 *                          that value. In both cases the previous value is what gets recorded
 *                          as flushing.
 * @return {@link HConstants#NO_SEQNUM} if the region has no unflushed entries left after the
 *         move (or had none to begin with); otherwise the lowest sequence id still unflushed in
 *         the region.
 */
Long startCacheFlush(final byte[] encodedRegionName, final Map<byte[], Long> familyToSeq) {
  Map<ImmutableByteArray, Long> oldSequenceIds = null;
  Long lowestUnflushedInRegion = HConstants.NO_SEQNUM;
  synchronized (tieLock) {
    Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
    if (m != null) {
      // NOTE: Removal from this.lowestUnflushedSequenceIds must be done in controlled
      // circumstance because another concurrent thread now may add sequenceids for this family
      // (see above in getOrCreateLowestSequenceId). Make sure you are ok with this. Usually it
      // is fine because updates are blocked when this method is called. Make sure!!!
      for (Map.Entry<byte[], Long> entry : familyToSeq.entrySet()) {
        ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(entry.getKey());
        Long seqId = null;
        if (entry.getValue() == HConstants.NO_SEQNUM) {
          seqId = m.remove(familyNameWrapper);
        } else {
          seqId = m.replace(familyNameWrapper, entry.getValue());
        }
        if (seqId != null) {
          // Allocate lazily so oldSequenceIds stays null when nothing was moved.
          if (oldSequenceIds == null) {
            oldSequenceIds = new HashMap<>();
          }
          oldSequenceIds.put(familyNameWrapper, seqId);
        }
      }
      if (oldSequenceIds != null && !oldSequenceIds.isEmpty()) {
        if (this.flushingSequenceIds.put(encodedRegionName, oldSequenceIds) != null) {
          LOG.warn("Flushing Map not cleaned up for {}, sequenceid={}",
            Bytes.toString(encodedRegionName), oldSequenceIds);
        }
      }
      if (m.isEmpty()) {
        // Remove it otherwise it will be in oldestUnflushedStoreSequenceIds for ever
        // even if the region is already moved to other server.
        // Do not worry about data racing, we held write lock of region when calling
        // startCacheFlush, so no one can add value to the map we removed.
        this.lowestUnflushedSequenceIds.remove(encodedRegionName);
      } else {
        // Flushing a subset of the region families. Return the sequence id of the oldest entry.
        lowestUnflushedInRegion = Collections.min(m.values());
      }
    }
  }
  // Do this check outside lock.
  // NOTE(review): oldSequenceIds is only allocated when an entry is about to be added to it, so
  // it is never both non-null and empty; as written this warning cannot fire. Kept unchanged to
  // preserve existing behaviour -- decide whether the intent was 'oldSequenceIds == null'.
  if (oldSequenceIds != null && oldSequenceIds.isEmpty()) {
    // TODO: if we have no oldStoreSeqNum, and WAL is not disabled, presumably either
    // the region is already flushing (which would make this call invalid), or there
    // were no appends after last flush, so why are we starting flush? Maybe we should
    // assert not empty. Less rigorous, but safer, alternative is telling the caller to stop.
    // For now preserve old logic.
    LOG.warn("Couldn't find oldest sequenceid for {}", Bytes.toString(encodedRegionName));
  }
  return lowestUnflushedInRegion;
}
/**
 * Finishes flush accounting for a region: drops the region's entry from
 * {@link #flushingSequenceIds} and bumps any remaining unflushed sequence id that is not greater
 * than {@code maxFlushedSeqId} up to {@code maxFlushedSeqId + 1}.
 * @param encodedRegionName encoded name of the region whose flush completed
 * @param maxFlushedSeqId   highest sequence id known to have been flushed
 */
void completeCacheFlush(byte[] encodedRegionName, long maxFlushedSeqId) {
  // This is a simple hack to avoid maxFlushedSeqId go backwards.
  // The system works fine normally, but if we make use of Durability.ASYNC_WAL and we are going
  // to flush all the stores, the maxFlushedSeqId will be next seq id of the region, but we may
  // still have some unsynced WAL entries in the ringbuffer after we call startCacheFlush, and
  // then it will be recorded as the lowestUnflushedSeqId by the above update method, which is
  // less than the current maxFlushedSeqId. And if next time we only flush the family with this
  // unusual lowestUnflushedSeqId, the maxFlushedSeqId will go backwards.
  // This is an unexpected behavior so we should fix it, otherwise it may cause unexpected
  // behavior in other area.
  // The solution here is a bit hack but fine. Just replace the lowestUnflushedSeqId with
  // maxFlushedSeqId + 1 if it is lesser. The meaning of maxFlushedSeqId is that, all edits less
  // than or equal to it have been flushed, i.e, persistent to HFile, so set
  // lowestUnflushedSequenceId to maxFlushedSeqId + 1 will not cause data loss.
  // And technically, using +1 is fine here. If the maxFlushesSeqId is just the flushOpSeqId, it
  // means we have flushed all the stores so the seq id for actual data should be at least plus 1.
  // And if we do not flush all the stores, then the maxFlushedSeqId is calculated by
  // lowestUnflushedSeqId - 1, so here let's plus the 1 back.
  Long wrappedSeqId = Long.valueOf(maxFlushedSeqId + 1);
  synchronized (tieLock) {
    this.flushingSequenceIds.remove(encodedRegionName);
    Map<ImmutableByteArray, Long> unflushed = lowestUnflushedSequenceIds.get(encodedRegionName);
    if (unflushed == null) {
      return;
    }
    for (Map.Entry<ImmutableByteArray, Long> e : unflushed.entrySet()) {
      if (e.getValue().longValue() <= maxFlushedSeqId) {
        // Updates the entry in place through the map's entry view.
        e.setValue(wrappedSeqId);
      }
    }
  }
}
/**
 * Undoes {@link #startCacheFlush}: moves the region's flushing sequence ids back into
 * {@link #lowestUnflushedSequenceIds}. If a family had meanwhile recorded an unflushed id that
 * is lower than the one being restored, an error is logged and the JVM is halted.
 * @param encodedRegionName encoded name of the region whose flush is being aborted
 */
void abortCacheFlush(final byte[] encodedRegionName) {
  // Method is called when we are crashing down because failed write flush AND it is called
  // if we fail prepare. The below is for the fail prepare case; we restore the old sequence ids.
  Map<ImmutableByteArray, Long> flushing = null;
  Map<ImmutableByteArray, Long> tmpMap = new HashMap<>();
  // Here we are moving sequenceids from flushing back to unflushed; doing opposite of what
  // happened in startCacheFlush. During prepare phase, we have update lock on the region so
  // no edits should be coming in via append.
  synchronized (tieLock) {
    flushing = this.flushingSequenceIds.remove(encodedRegionName);
    if (flushing != null) {
      Map<ImmutableByteArray, Long> unflushed = getOrCreateLowestSequenceIds(encodedRegionName);
      for (Map.Entry<ImmutableByteArray, Long> e : flushing.entrySet()) {
        // Set into unflushed the 'old' oldest sequenceid and if any value in flushed with this
        // value, it will now be in tmpMap (null when the family had no unflushed entry).
        tmpMap.put(e.getKey(), unflushed.put(e.getKey(), e.getValue()));
      }
    }
  }
  // Here we are doing some 'test' to see if edits are going in out of order. What is it for?
  // Carried over from old code.
  if (flushing != null) {
    for (Map.Entry<ImmutableByteArray, Long> e : flushing.entrySet()) {
      Long currentId = tmpMap.get(e.getKey());
      if (currentId != null && currentId.longValue() < e.getValue().longValue()) {
        String errorStr = Bytes.toString(encodedRegionName) + " family " + e.getKey().toString()
          + " acquired edits out of order current memstore seq=" + currentId
          + ", previous oldest unflushed id=" + e.getValue();
        LOG.error(errorStr);
        Runtime.getRuntime().halt(1);
      }
    }
  }
}
/**
 * See if passed {@code sequenceids} are lower -- i.e. earlier -- than any outstanding
 * sequenceids, sequenceids we are holding on to in this accounting instance.
 * @param sequenceids Keyed by encoded region name. Cannot be null (doesn't make sense for it to
 *                    be null).
 * @return true if every passed sequenceid is strictly lower than the lowest flushing/unflushed
 *         sequenceid held here for the same region; a region with no entry in this instance
 *         never fails the check.
 */
boolean areAllLower(Map<byte[], Long> sequenceids) {
  Map<byte[], Long> flushing = null;
  Map<byte[], Long> unflushed = null;
  synchronized (this.tieLock) {
    // Get a flattened -- only the oldest sequenceid -- copy of current flushing and unflushed
    // data structures to use in tests below.
    flushing = flattenToLowestSequenceId(this.flushingSequenceIds);
    unflushed = flattenToLowestSequenceId(this.lowestUnflushedSequenceIds);
  }
  for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
    // MAX_VALUE stands for "nothing outstanding", so a missing region can never fail the test.
    long oldestFlushing = Long.MAX_VALUE;
    long oldestUnflushed = Long.MAX_VALUE;
    if (flushing != null && flushing.containsKey(e.getKey())) {
      oldestFlushing = flushing.get(e.getKey());
    }
    if (unflushed != null && unflushed.containsKey(e.getKey())) {
      oldestUnflushed = unflushed.get(e.getKey());
    }
    long min = Math.min(oldestFlushing, oldestUnflushed);
    if (min <= e.getValue()) {
      return false;
    }
  }
  return true;
}
/**
* Iterates over the given Map and compares sequence ids with corresponding entries in
* {@link #lowestUnflushedSequenceIds}. If a region in {@link #lowestUnflushedSequenceIds} has a
* sequence id less than that passed in sequenceids
then return it.
* @param sequenceids Sequenceids keyed by encoded region name.
* @return stores of regions found in this instance with sequence ids less than those passed in.
*/
Map> findLower(Map sequenceids) {
Map> toFlush = null;
// Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
synchronized (tieLock) {
for (Map.Entry e : sequenceids.entrySet()) {
Map m = this.lowestUnflushedSequenceIds.get(e.getKey());
if (m == null) {
continue;
}
for (Map.Entry me : m.entrySet()) {
if (me.getValue() <= e.getValue()) {
if (toFlush == null) {
toFlush = new TreeMap(Bytes.BYTES_COMPARATOR);
}
toFlush.computeIfAbsent(e.getKey(), k -> new ArrayList<>())
.add(Bytes.toBytes(me.getKey().toString()));
}
}
}
}
return toFlush;
}
}