// org.apache.hadoop.hbase.regionserver.wal.SequenceIdAccounting Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of hbase-server Show documentation
// Server functionality for HBase
// There is a newer version: 3.0.0-beta-1
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver.wal;

import static org.apache.hadoop.hbase.util.CollectionUtils.computeIfAbsent;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ImmutableByteArray;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
 * 
 * Accounting of sequence ids per region and then by column family. So we can our accounting
 * current, call startCacheFlush and then finishedCacheFlush or abortCacheFlush so this instance can
 * keep abreast of the state of sequence id persistence. Also call update per append.
 * 
 * 
 * For the implementation, we assume that all the {@code encodedRegionName} passed in is gotten by
 * {@link org.apache.hadoop.hbase.client.RegionInfo#getEncodedNameAsBytes()}. So it is safe to use
 * it as a hash key. And for family name, we use {@link ImmutableByteArray} as key. This is because
 * hash based map is much faster than RBTree or CSLM and here we are on the critical write path. See
 * HBASE-16278 for more details.
 * 
 */
@InterfaceAudience.Private
class SequenceIdAccounting {

  private static final Logger LOG = LoggerFactory.getLogger(SequenceIdAccounting.class);
  /**
   * This lock ties all operations on {@link SequenceIdAccounting#flushingSequenceIds} and
   * {@link #lowestUnflushedSequenceIds} Maps. {@link #lowestUnflushedSequenceIds} has the
   * lowest outstanding sequence ids EXCEPT when flushing. When we flush, the current
   * lowest set for the region/column family are moved (atomically because of this lock) to
   * {@link #flushingSequenceIds}.
   * 
   * The two Maps are tied by this locking object EXCEPT when we go to update the lowest
   * entry; see {@link #lowestUnflushedSequenceIds}. In here is a putIfAbsent call on
   * {@link #lowestUnflushedSequenceIds}. In this latter case, we will add this lowest
   * sequence id if we find that there is no entry for the current column family. There will be no
   * entry only if we just came up OR we have moved aside current set of lowest sequence ids
   * because the current set are being flushed (by putting them into {@link #flushingSequenceIds}).
   * This is how we pick up the next 'lowest' sequence id per region per column family to be used
   * figuring what is in the next flush.
   */
  private final Object tieLock = new Object();

  /**
   * Map of encoded region names and family names to their OLDEST -- i.e. their first,
   * the longest-lived, their 'earliest', the 'lowest' -- sequence id.
   *
   * 
When we flush, the current lowest sequence ids get cleared and added to
   * {@link #flushingSequenceIds}. The next append that comes in, is then added
   * here to {@link #lowestUnflushedSequenceIds} as the next lowest sequenceid.
   *
   * 
If flush fails, currently server is aborted so no need to restore previous sequence ids.
   * 
Needs to be concurrent Maps because we use putIfAbsent updating oldest.
   */
  private final ConcurrentMap>
    lowestUnflushedSequenceIds = new ConcurrentHashMap<>();

  /**
   * Map of encoded region names and family names to their lowest or OLDEST sequence/edit id
   * currently being flushed out to hfiles. Entries are moved here from
   * {@link #lowestUnflushedSequenceIds} while the lock {@link #tieLock} is held
   * (so movement between the Maps is atomic).
   */
  private final Map> flushingSequenceIds = new HashMap<>();

  /**
   * 

   * Map of region encoded names to the latest/highest region sequence id. Updated on each call to
   * append.
   * 
   * 
   * This map uses byte[] as the key, and uses reference equality. It works in our use case as we
   * use {@link org.apache.hadoop.hbase.client.RegionInfo#getEncodedNameAsBytes()} as keys. For a
   * given region, it always returns the same array.
   * 
   */
  private Map highestSequenceIds = new HashMap<>();

  /**
   * Returns the lowest unflushed sequence id for the region.
   * @param encodedRegionName
   * @return Lowest outstanding unflushed sequenceid for encodedRegionName. Will
   * return {@link HConstants#NO_SEQNUM} when none.
   */
  long getLowestSequenceId(final byte[] encodedRegionName) {
    synchronized (this.tieLock) {
      Map m = this.flushingSequenceIds.get(encodedRegionName);
      long flushingLowest = m != null ? getLowestSequenceId(m) : Long.MAX_VALUE;
      m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
      long unflushedLowest = m != null ? getLowestSequenceId(m) : HConstants.NO_SEQNUM;
      return Math.min(flushingLowest, unflushedLowest);
    }
  }

  /**
   * @param encodedRegionName
   * @param familyName
   * @return Lowest outstanding unflushed sequenceid for encodedRegionname and
   *         familyName. Returned sequenceid may be for an edit currently being
   *         flushed.
   */
  long getLowestSequenceId(final byte[] encodedRegionName, final byte[] familyName) {
    ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
    synchronized (this.tieLock) {
      Map m = this.flushingSequenceIds.get(encodedRegionName);
      if (m != null) {
        Long lowest = m.get(familyNameWrapper);
        if (lowest != null) {
          return lowest;
        }
      }
      m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
      if (m != null) {
        Long lowest = m.get(familyNameWrapper);
        if (lowest != null) {
          return lowest;
        }
      }
    }
    return HConstants.NO_SEQNUM;
  }

  /**
   * Reset the accounting of highest sequenceid by regionname.
   * @return Return the previous accounting Map of regions to the last sequence id written into
   * each.
   */
  Map resetHighest() {
    Map old = this.highestSequenceIds;
    this.highestSequenceIds = new HashMap<>();
    return old;
  }

  /**
   * We've been passed a new sequenceid for the region. Set it as highest seen for this region and
   * if we are to record oldest, or lowest sequenceids, save it as oldest seen if nothing
   * currently older.
   * @param encodedRegionName
   * @param families
   * @param sequenceid
   * @param lowest Whether to keep running account of oldest sequence id.
   */
  void update(byte[] encodedRegionName, Set families, long sequenceid,
      final boolean lowest) {
    Long l = Long.valueOf(sequenceid);
    this.highestSequenceIds.put(encodedRegionName, l);
    if (lowest) {
      ConcurrentMap m = getOrCreateLowestSequenceIds(encodedRegionName);
      for (byte[] familyName : families) {
        m.putIfAbsent(ImmutableByteArray.wrap(familyName), l);
      }
    }
  }

  /**
   * Update the store sequence id, e.g., upon executing in-memory compaction
   */
  void updateStore(byte[] encodedRegionName, byte[] familyName, Long sequenceId,
      boolean onlyIfGreater) {
    if (sequenceId == null) {
      return;
    }
    Long highest = this.highestSequenceIds.get(encodedRegionName);
    if (highest == null || sequenceId > highest) {
      this.highestSequenceIds.put(encodedRegionName, sequenceId);
    }
    ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
    synchronized (this.tieLock) {
      ConcurrentMap m = getOrCreateLowestSequenceIds(encodedRegionName);
      boolean replaced = false;
      while (!replaced) {
        Long oldSeqId = m.get(familyNameWrapper);
        if (oldSeqId == null) {
          m.put(familyNameWrapper, sequenceId);
          replaced = true;
        } else if (onlyIfGreater) {
          if (sequenceId > oldSeqId) {
            replaced = m.replace(familyNameWrapper, oldSeqId, sequenceId);
          } else {
            return;
          }
        } else { // replace even if sequence id is not greater than oldSeqId
          m.put(familyNameWrapper, sequenceId);
          return;
        }
      }
    }
  }

  @VisibleForTesting
  ConcurrentMap getOrCreateLowestSequenceIds(byte[] encodedRegionName) {
    // Intentionally, this access is done outside of this.regionSequenceIdLock. Done per append.
    return computeIfAbsent(this.lowestUnflushedSequenceIds, encodedRegionName,
      ConcurrentHashMap::new);
  }

  /**
   * @param sequenceids Map to search for lowest value.
   * @return Lowest value found in sequenceids.
   */
  private static long getLowestSequenceId(Map sequenceids) {
    long lowest = HConstants.NO_SEQNUM;
    for (Long sid: sequenceids.values()) {
      if (lowest == HConstants.NO_SEQNUM || sid.longValue() < lowest) {
        lowest = sid.longValue();
      }
    }
    return lowest;
  }

  /**
   * @param src
   * @return New Map that has same keys as src but instead of a Map for a value, it
   *         instead has found the smallest sequence id and it returns that as the value instead.
   */
  private > Map flattenToLowestSequenceId(Map src) {
    if (src == null || src.isEmpty()) {
      return null;
    }
    Map tgt = new HashMap<>();
    for (Map.Entry entry : src.entrySet()) {
      long lowestSeqId = getLowestSequenceId(entry.getValue());
      if (lowestSeqId != HConstants.NO_SEQNUM) {
        tgt.put(entry.getKey(), lowestSeqId);
      }
    }
    return tgt;
  }

  /**
   * @param encodedRegionName Region to flush.
   * @param families Families to flush. May be a subset of all families in the region.
   * @return Returns {@link HConstants#NO_SEQNUM} if we are flushing the whole region OR if
   * we are flushing a subset of all families but there are no edits in those families not
   * being flushed; in other words, this is effectively same as a flush of all of the region
   * though we were passed a subset of regions. Otherwise, it returns the sequence id of the
   * oldest/lowest outstanding edit.
   */
  Long startCacheFlush(final byte[] encodedRegionName, final Set families) {
    Map familytoSeq = new HashMap<>();
    for (byte[] familyName : families){
      familytoSeq.put(familyName,HConstants.NO_SEQNUM);
    }
    return startCacheFlush(encodedRegionName,familytoSeq);
  }

  Long startCacheFlush(final byte[] encodedRegionName, final Map familyToSeq) {
    Map oldSequenceIds = null;
    Long lowestUnflushedInRegion = HConstants.NO_SEQNUM;
    synchronized (tieLock) {
      Map m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
      if (m != null) {
        // NOTE: Removal from this.lowestUnflushedSequenceIds must be done in controlled
        // circumstance because another concurrent thread now may add sequenceids for this family
        // (see above in getOrCreateLowestSequenceId). Make sure you are ok with this. Usually it
        // is fine because updates are blocked when this method is called. Make sure!!!
        for (Map.Entry entry : familyToSeq.entrySet()) {
          ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap((byte[]) entry.getKey());
          Long seqId = null;
          if(entry.getValue() == HConstants.NO_SEQNUM) {
            seqId = m.remove(familyNameWrapper);
          } else {
            seqId = m.replace(familyNameWrapper, entry.getValue());
          }
          if (seqId != null) {
            if (oldSequenceIds == null) {
              oldSequenceIds = new HashMap<>();
            }
            oldSequenceIds.put(familyNameWrapper, seqId);
          }
        }
        if (oldSequenceIds != null && !oldSequenceIds.isEmpty()) {
          if (this.flushingSequenceIds.put(encodedRegionName, oldSequenceIds) != null) {
            LOG.warn("Flushing Map not cleaned up for " + Bytes.toString(encodedRegionName) +
              ", sequenceid=" + oldSequenceIds);
          }
        }
        if (m.isEmpty()) {
          // Remove it otherwise it will be in oldestUnflushedStoreSequenceIds for ever
          // even if the region is already moved to other server.
          // Do not worry about data racing, we held write lock of region when calling
          // startCacheFlush, so no one can add value to the map we removed.
          this.lowestUnflushedSequenceIds.remove(encodedRegionName);
        } else {
          // Flushing a subset of the region families. Return the sequence id of the oldest entry.
          lowestUnflushedInRegion = Collections.min(m.values());
        }
      }
    }
    // Do this check outside lock.
    if (oldSequenceIds != null && oldSequenceIds.isEmpty()) {
      // TODO: if we have no oldStoreSeqNum, and WAL is not disabled, presumably either
      // the region is already flushing (which would make this call invalid), or there
      // were no appends after last flush, so why are we starting flush? Maybe we should
      // assert not empty. Less rigorous, but safer, alternative is telling the caller to stop.
      // For now preserve old logic.
      LOG.warn("Couldn't find oldest sequenceid for " + Bytes.toString(encodedRegionName));
    }
    return lowestUnflushedInRegion;
  }

  void completeCacheFlush(final byte[] encodedRegionName) {
    synchronized (tieLock) {
      this.flushingSequenceIds.remove(encodedRegionName);
    }
  }

  void abortCacheFlush(final byte[] encodedRegionName) {
    // Method is called when we are crashing down because failed write flush AND it is called
    // if we fail prepare. The below is for the fail prepare case; we restore the old sequence ids.
    Map flushing = null;
    Map tmpMap = new HashMap<>();
    // Here we are moving sequenceids from flushing back to unflushed; doing opposite of what
    // happened in startCacheFlush. During prepare phase, we have update lock on the region so
    // no edits should be coming in via append.
    synchronized (tieLock) {
      flushing = this.flushingSequenceIds.remove(encodedRegionName);
      if (flushing != null) {
        Map unflushed = getOrCreateLowestSequenceIds(encodedRegionName);
        for (Map.Entry e: flushing.entrySet()) {
          // Set into unflushed the 'old' oldest sequenceid and if any value in flushed with this
          // value, it will now be in tmpMap.
          tmpMap.put(e.getKey(), unflushed.put(e.getKey(), e.getValue()));
        }
      }
    }

    // Here we are doing some 'test' to see if edits are going in out of order. What is it for?
    // Carried over from old code.
    if (flushing != null) {
      for (Map.Entry e : flushing.entrySet()) {
        Long currentId = tmpMap.get(e.getKey());
        if (currentId != null && currentId.longValue() < e.getValue().longValue()) {
          String errorStr = Bytes.toString(encodedRegionName) + " family "
              + e.getKey().toStringUtf8() + " acquired edits out of order current memstore seq="
              + currentId + ", previous oldest unflushed id=" + e.getValue();
          LOG.error(errorStr);
          Runtime.getRuntime().halt(1);
        }
      }
    }
  }

  /**
   * See if passed sequenceids are lower -- i.e. earlier -- than any outstanding
   * sequenceids, sequenceids we are holding on to in this accounting instance.
   * @param sequenceids Keyed by encoded region name. Cannot be null (doesn't make sense for it to
   *          be null).
   * @return true if all sequenceids are lower, older than, the old sequenceids in this instance.
   */
  boolean areAllLower(Map sequenceids) {
    Map flushing = null;
    Map unflushed = null;
    synchronized (this.tieLock) {
      // Get a flattened -- only the oldest sequenceid -- copy of current flushing and unflushed
      // data structures to use in tests below.
      flushing = flattenToLowestSequenceId(this.flushingSequenceIds);
      unflushed = flattenToLowestSequenceId(this.lowestUnflushedSequenceIds);
    }
    for (Map.Entry e : sequenceids.entrySet()) {
      long oldestFlushing = Long.MAX_VALUE;
      long oldestUnflushed = Long.MAX_VALUE;
      if (flushing != null && flushing.containsKey(e.getKey())) {
        oldestFlushing = flushing.get(e.getKey());
      }
      if (unflushed != null && unflushed.containsKey(e.getKey())) {
        oldestUnflushed = unflushed.get(e.getKey());
      }
      long min = Math.min(oldestFlushing, oldestUnflushed);
      if (min <= e.getValue()) {
        return false;
      }
    }
    return true;
  }

  /**
   * Iterates over the given Map and compares sequence ids with corresponding entries in
   * {@link #lowestUnflushedSequenceIds}. If a region in
   * {@link #lowestUnflushedSequenceIds} has a sequence id less than that passed in
   * sequenceids then return it.
   * @param sequenceids Sequenceids keyed by encoded region name.
   * @return regions found in this instance with sequence ids less than those passed in.
   */
  byte[][] findLower(Map sequenceids) {
    List toFlush = null;
    // Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
    synchronized (tieLock) {
      for (Map.Entry e : sequenceids.entrySet()) {
        Map m = this.lowestUnflushedSequenceIds.get(e.getKey());
        if (m == null) {
          continue;
        }
        // The lowest sequence id outstanding for this region.
        long lowest = getLowestSequenceId(m);
        if (lowest != HConstants.NO_SEQNUM && lowest <= e.getValue()) {
          if (toFlush == null) {
            toFlush = new ArrayList<>();
          }
          toFlush.add(e.getKey());
        }
      }
    }
    return toFlush == null ? null : toFlush.toArray(new byte[0][]);
  }
}