org.apache.hadoop.hbase.regionserver.MemStoreLABImpl Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hbase-server Show documentation
Server functionality for HBase
There is a newer version: 3.0.0-beta-1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import com.google.errorprone.annotations.RestrictedApi;
import java.nio.ByteBuffer;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ByteBufferExtendedCell;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.nio.RefCnt;
import org.apache.hadoop.hbase.regionserver.CompactingMemStore.IndexType;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;

/**
 * A memstore-local allocation buffer.
 * <p>
 * The MemStoreLAB is basically a bump-the-pointer allocator that allocates big (2MB) byte[] chunks
 * and then doles them out to threads that request slices into the array.
 * <p>
 * The purpose of this class is to combat heap fragmentation in the regionserver. By ensuring that
 * all Cells in a given memstore refer only to large chunks of contiguous memory, we ensure that
 * large blocks get freed up when the memstore is flushed.
 * <p>
 * Without the MSLAB, the byte arrays allocated during insertion end up interleaved throughout the
 * heap, and the old generation gets progressively more fragmented until a stop-the-world compacting
 * collection occurs.
 * <p>
 * TODO: we should probably benchmark whether word-aligning the allocations would provide a
 * performance improvement - probably would speed up the Bytes.toLong/Bytes.toInt calls in KeyValue,
 * but some of those are cached anyway. The chunks created by this MemStoreLAB can get pooled at
 * {@link ChunkCreator}. When the Chunk comes from pool, it can be either an on heap or an off heap
 * backed chunk. The chunks, which this MemStoreLAB creates on its own (when no chunk available from
 * pool), those will be always on heap backed.
 * <p>
 * NOTE: if the user requested to work with MSLABs (whether on- or off-heap), in the
 * {@link CompactingMemStore} ctor, the {@link CompactingMemStore#indexType} could only be
 * {@link IndexType#CHUNK_MAP}, that is to say the immutable segments using MSLABs are going to use
 * {@link CellChunkMap} as their index.
 */
@InterfaceAudience.Private
public class MemStoreLABImpl implements MemStoreLAB {

  static final Logger LOG = LoggerFactory.getLogger(MemStoreLABImpl.class);

  private AtomicReference currChunk = new AtomicReference<>();
  // Lock to manage multiple handlers requesting for a chunk
  private ReentrantLock lock = new ReentrantLock();

  // A set of chunks contained by this memstore LAB
  Set chunks = new ConcurrentSkipListSet();
  private final int dataChunkSize;
  private final int maxAlloc;
  private final ChunkCreator chunkCreator;

  // This flag is for closing this instance, its set when clearing snapshot of
  // memstore
  private final AtomicBoolean closed = new AtomicBoolean(false);;
  // This flag is for reclaiming chunks. Its set when putting chunks back to
  // pool
  private final AtomicBoolean reclaimed = new AtomicBoolean(false);
  /**
   * Its initial value is 1, so it is one bigger than the current count of open scanners which
   * reading data from this MemStoreLAB.
   */
  private final RefCnt refCnt;

  // Used in testing
  public MemStoreLABImpl() {
    this(new Configuration());
  }

  public MemStoreLABImpl(Configuration conf) {
    dataChunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT);
    maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT);
    this.chunkCreator = ChunkCreator.getInstance();
    // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one!
    Preconditions.checkArgument(maxAlloc <= dataChunkSize,
      MAX_ALLOC_KEY + " must be less than " + CHUNK_SIZE_KEY);

    this.refCnt = RefCnt.create(() -> {
      recycleChunks();
    });

  }

  @Override
  public Cell copyCellInto(Cell cell) {
    // See head of copyBBECellInto for how it differs from copyCellInto
    return (cell instanceof ByteBufferExtendedCell)
      ? copyBBECellInto((ByteBufferExtendedCell) cell, maxAlloc)
      : copyCellInto(cell, maxAlloc);
  }

  /**
   * When a cell's size is too big (bigger than maxAlloc), copyCellInto does not allocate it on
   * MSLAB. Since the process of flattening to CellChunkMap assumes that all cells are allocated on
   * MSLAB, during this process, the big cells are copied into MSLAB using this method.
   */
  @Override
  public Cell forceCopyOfBigCellInto(Cell cell) {
    int size = Segment.getCellLength(cell);
    Preconditions.checkArgument(size >= 0, "negative size");
    if (size + ChunkCreator.SIZEOF_CHUNK_HEADER <= dataChunkSize) {
      // Using copyCellInto for cells which are bigger than the original maxAlloc
      return copyCellInto(cell, dataChunkSize);
    } else {
      Chunk c = getNewExternalChunk(size);
      int allocOffset = c.alloc(size);
      return copyToChunkCell(cell, c.getData(), allocOffset, size);
    }
  }

  /**
   * Mostly a duplicate of {@link #copyCellInto(Cell, int)}} done for perf sake. It presumes
   * ByteBufferExtendedCell instead of Cell so we deal with a specific type rather than the super
   * generic Cell. Removes instanceof checks. Shrinkage is enough to make this inline where before
   * it was too big. Uses less CPU. See HBASE-20875 for evidence.
   * @see #copyCellInto(Cell, int)
   */
  private Cell copyBBECellInto(ByteBufferExtendedCell cell, int maxAlloc) {
    int size = cell.getSerializedSize();
    Preconditions.checkArgument(size >= 0, "negative size");
    // Callers should satisfy large allocations from JVM heap so limit fragmentation.
    if (size > maxAlloc) {
      return null;
    }
    Chunk c = null;
    int allocOffset = 0;
    while (true) {
      // Try to get the chunk
      c = getOrMakeChunk();
      // We may get null because the some other thread succeeded in getting the lock
      // and so the current thread has to try again to make its chunk or grab the chunk
      // that the other thread created
      // Try to allocate from this chunk
      if (c != null) {
        allocOffset = c.alloc(size);
        if (allocOffset != -1) {
          // We succeeded - this is the common case - small alloc
          // from a big buffer
          break;
        }
        // not enough space!
        // try to retire this chunk
        tryRetireChunk(c);
      }
    }
    return copyBBECToChunkCell(cell, c.getData(), allocOffset, size);
  }

  /**
   * @see #copyBBECellInto(ByteBufferExtendedCell, int)
   */
  private Cell copyCellInto(Cell cell, int maxAlloc) {
    int size = Segment.getCellLength(cell);
    Preconditions.checkArgument(size >= 0, "negative size");
    // Callers should satisfy large allocations directly from JVM since they
    // don't cause fragmentation as badly.
    if (size > maxAlloc) {
      return null;
    }
    Chunk c = null;
    int allocOffset = 0;
    while (true) {
      // Try to get the chunk
      c = getOrMakeChunk();
      // we may get null because the some other thread succeeded in getting the lock
      // and so the current thread has to try again to make its chunk or grab the chunk
      // that the other thread created
      // Try to allocate from this chunk
      if (c != null) {
        allocOffset = c.alloc(size);
        if (allocOffset != -1) {
          // We succeeded - this is the common case - small alloc
          // from a big buffer
          break;
        }
        // not enough space!
        // try to retire this chunk
        tryRetireChunk(c);
      }
    }
    return copyToChunkCell(cell, c.getData(), allocOffset, size);
  }

  /**
   * Clone the passed cell by copying its data into the passed buf and create a cell with a chunkid
   * out of it
   * @see #copyBBECToChunkCell(ByteBufferExtendedCell, ByteBuffer, int, int)
   */
  private static Cell copyToChunkCell(Cell cell, ByteBuffer buf, int offset, int len) {
    int tagsLen = cell.getTagsLength();
    if (cell instanceof ExtendedCell) {
      ((ExtendedCell) cell).write(buf, offset);
    } else {
      // Normally all Cell impls within Server will be of type ExtendedCell. Just considering the
      // other case also. The data fragments within Cell is copied into buf as in KeyValue
      // serialization format only.
      KeyValueUtil.appendTo(cell, buf, offset, true);
    }
    return createChunkCell(buf, offset, len, tagsLen, cell.getSequenceId());
  }

  /**
   * Clone the passed cell by copying its data into the passed buf and create a cell with a chunkid
   * out of it
   * @see #copyToChunkCell(Cell, ByteBuffer, int, int)
   */
  private static Cell copyBBECToChunkCell(ByteBufferExtendedCell cell, ByteBuffer buf, int offset,
    int len) {
    int tagsLen = cell.getTagsLength();
    cell.write(buf, offset);
    return createChunkCell(buf, offset, len, tagsLen, cell.getSequenceId());
  }

  private static Cell createChunkCell(ByteBuffer buf, int offset, int len, int tagsLen,
    long sequenceId) {
    // TODO : write the seqid here. For writing seqId we should create a new cell type so
    // that seqId is not used as the state
    if (tagsLen == 0) {
      // When tagsLen is 0, make a NoTagsByteBufferKeyValue version. This is an optimized class
      // which directly return tagsLen as 0. So we avoid parsing many length components in
      // reading the tagLength stored in the backing buffer. The Memstore addition of every Cell
      // call getTagsLength().
      return new NoTagByteBufferChunkKeyValue(buf, offset, len, sequenceId);
    } else {
      return new ByteBufferChunkKeyValue(buf, offset, len, sequenceId);
    }
  }

  /**
   * Close this instance since it won't be used any more, try to put the chunks back to pool
   */
  @Override
  public void close() {
    if (!this.closed.compareAndSet(false, true)) {
      return;
    }
    // We could put back the chunks to pool for reusing only when there is no
    // opening scanner which will read their data
    this.refCnt.release();
  }

  @RestrictedApi(explanation = "Should only be called in tests", link = "",
      allowedOnPath = ".*/src/test/.*")
  int getRefCntValue() {
    return this.refCnt.refCnt();
  }

  /**
   * Called when opening a scanner on the data of this MemStoreLAB
   */
  @Override
  public void incScannerCount() {
    this.refCnt.retain();
  }

  /**
   * Called when closing a scanner on the data of this MemStoreLAB
   */
  @Override
  public void decScannerCount() {
    this.refCnt.release();
  }

  private void recycleChunks() {
    if (reclaimed.compareAndSet(false, true)) {
      chunkCreator.putbackChunks(chunks);
      chunks.clear();
    }
  }

  /**
   * Try to retire the current chunk if it is still c. Postcondition is that
   * curChunk.get() != c
   * @param c the chunk to retire
   */
  private void tryRetireChunk(Chunk c) {
    currChunk.compareAndSet(c, null);
    // If the CAS succeeds, that means that we won the race
    // to retire the chunk. We could use this opportunity to
    // update metrics on external fragmentation.
    //
    // If the CAS fails, that means that someone else already
    // retired the chunk for us.
  }

  /**
   * Get the current chunk, or, if there is no current chunk, allocate a new one from the JVM.
   */
  private Chunk getOrMakeChunk() {
    // Try to get the chunk
    Chunk c = currChunk.get();
    if (c != null) {
      return c;
    }
    // No current chunk, so we want to allocate one. We race
    // against other allocators to CAS in an uninitialized chunk
    // (which is cheap to allocate)
    if (lock.tryLock()) {
      try {
        // once again check inside the lock
        c = currChunk.get();
        if (c != null) {
          return c;
        }
        c = this.chunkCreator.getChunk();
        if (c != null) {
          // set the curChunk. No need of CAS as only one thread will be here
          currChunk.set(c);
          chunks.add(c.getId());
          return c;
        }
      } finally {
        lock.unlock();
      }
    }
    return null;
  }

  /*
   * Returning a new pool chunk, without replacing current chunk, meaning MSLABImpl does not make
   * the returned chunk as CurChunk. The space on this chunk will be allocated externally. The
   * interface is only for external callers.
   */
  @Override
  public Chunk getNewExternalChunk(ChunkCreator.ChunkType chunkType) {
    switch (chunkType) {
      case INDEX_CHUNK:
      case DATA_CHUNK:
        Chunk c = this.chunkCreator.getChunk(chunkType);
        chunks.add(c.getId());
        return c;
      case JUMBO_CHUNK: // a jumbo chunk doesn't have a fixed size
      default:
        return null;
    }
  }

  /*
   * Returning a new chunk, without replacing current chunk, meaning MSLABImpl does not make the
   * returned chunk as CurChunk. The space on this chunk will be allocated externally. The interface
   * is only for external callers. Chunks from pools are not allocated from here, since they have
   * fixed sizes
   */
  @Override
  public Chunk getNewExternalChunk(int size) {
    int allocSize = size + ChunkCreator.SIZEOF_CHUNK_HEADER;
    if (allocSize <= ChunkCreator.getInstance().getChunkSize()) {
      return getNewExternalChunk(ChunkCreator.ChunkType.DATA_CHUNK);
    } else {
      Chunk c = this.chunkCreator.getJumboChunk(size);
      chunks.add(c.getId());
      return c;
    }
  }

  @Override
  public boolean isOnHeap() {
    return !isOffHeap();
  }

  @Override
  public boolean isOffHeap() {
    return this.chunkCreator.isOffheap();
  }

  Chunk getCurrentChunk() {
    return currChunk.get();
  }

  BlockingQueue getPooledChunks() {
    BlockingQueue pooledChunks = new LinkedBlockingQueue<>();
    for (Integer id : this.chunks) {
      Chunk chunk = chunkCreator.getChunk(id);
      if (chunk != null && chunk.isFromPool()) {
        pooledChunks.add(chunk);
      }
    }
    return pooledChunks;
  }

  Integer getNumOfChunksReturnedToPool(Set chunksId) {
    int i = 0;
    for (Integer id : chunksId) {
      if (chunkCreator.isChunkInPool(id)) {
        i++;
      }
    }
    return i;
  }

  boolean isReclaimed() {
    return reclaimed.get();
  }

  boolean isClosed() {
    return closed.get();
  }
}