/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.llap.cache;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;

import org.apache.hadoop.hive.common.io.Allocator;
import org.apache.hadoop.hive.common.io.CacheTag;
import org.apache.hadoop.hive.common.io.DataCache.BooleanRef;
import org.apache.hadoop.hive.common.io.DataCache.DiskRangeListFactory;
import org.apache.hadoop.hive.common.io.DiskRange;
import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer;
import org.apache.hadoop.hive.llap.io.api.impl.LlapIoImpl;
import org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics;
import org.apache.hive.common.util.Ref;
import org.apache.orc.impl.RecordReaderUtils;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;

public class LowLevelCacheImpl implements LowLevelCache, BufferUsageManager, LlapIoDebugDump {
  private static final int DEFAULT_CLEANUP_INTERVAL = 600;
  private final Allocator allocator;
  private final AtomicInteger newEvictions = new AtomicInteger(0);
  private Thread cleanupThread = null;
  // TODO: given the specific data and lookups, perhaps the nested thing should not be a map
  //       In fact, CSLM has slow single-threaded operation, and one file is probably often read
  //       by just one (or few) threads, so a much more simple DS with locking might be better.
  //       Let's use CSLM for now, since it's available.
  private final ConcurrentHashMap<Object, FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>>>
      cache = new ConcurrentHashMap<>();
  private final LowLevelCachePolicy cachePolicy;
  private final long cleanupInterval;
  private final LlapDaemonCacheMetrics metrics;
  private final boolean doAssumeGranularBlocks;

  private static final Function<Void, ConcurrentSkipListMap<Long, LlapDataBuffer>> CACHE_CTOR =
      new Function<Void, ConcurrentSkipListMap<Long, LlapDataBuffer>>() {
        @Override
        public ConcurrentSkipListMap<Long, LlapDataBuffer> apply(Void input) {
          return new ConcurrentSkipListMap<>();
        }
      };

  public LowLevelCacheImpl(LlapDaemonCacheMetrics metrics, LowLevelCachePolicy cachePolicy,
      Allocator allocator, boolean doAssumeGranularBlocks) {
    this(metrics, cachePolicy, allocator, doAssumeGranularBlocks, DEFAULT_CLEANUP_INTERVAL);
  }

  @VisibleForTesting
  LowLevelCacheImpl(LlapDaemonCacheMetrics metrics, LowLevelCachePolicy cachePolicy,
      Allocator allocator, boolean doAssumeGranularBlocks, long cleanupInterval) {
    LlapIoImpl.LOG.info("Low level cache; cleanup interval {} sec", cleanupInterval);
    this.cachePolicy = cachePolicy;
    this.allocator = allocator;
    this.cleanupInterval = cleanupInterval;
    this.metrics = metrics;
    this.doAssumeGranularBlocks = doAssumeGranularBlocks;
  }

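  /** Starts the background cleanup thread; a negative cleanup interval disables it. */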
  public void startThreads() {
    if (cleanupInterval < 0) return;
    cleanupThread = new CleanupThread(cache, newEvictions, cleanupInterval);
    cleanupThread.start();
  }

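  /**
   * Looks up cached data for the given file and offset ranges. Cached buffers are locked and
   * spliced into the request list in place of the ranges they cover; gaps that are not cached
   * remain as plain ranges for the caller to read. If gotAllData is supplied, it is set to
   * indicate whether the entire request was served from the cache.
   */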
  @Override
  public DiskRangeList getFileData(Object fileKey, DiskRangeList ranges, long baseOffset,
      DiskRangeListFactory factory, LowLevelCacheCounters qfCounters, BooleanRef gotAllData) {
    if (ranges == null) return null;
    DiskRangeList prev = ranges.prev;
    FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>> subCache = cache.get(fileKey);
    if (subCache == null || !subCache.incRef()) {
      long totalMissed = ranges.getTotalLength();
      metrics.incrCacheRequestedBytes(totalMissed);
      if (qfCounters != null) {
        qfCounters.recordCacheMiss(totalMissed);
      }
      if (prev != null && gotAllData != null) {
        gotAllData.value = false;
      }
      return ranges;
    }
    try {
      if (prev == null) {
        prev = new MutateHelper(ranges);
      }
      if (gotAllData != null) {
        gotAllData.value = true;
      }
      DiskRangeList current = ranges;
      while (current != null) {
        metrics.incrCacheRequestedBytes(current.getLength());
        // We assume ranges in "ranges" are non-overlapping; thus, we will save next in advance.
        DiskRangeList next = current.next;
        getOverlappingRanges(baseOffset, current, subCache.getCache(), factory, gotAllData);
        current = next;
      }
    } finally {
      subCache.decRef();
    }
    boolean isInvalid = false;
    if (qfCounters != null) {
      DiskRangeList current = prev.next;
      long bytesHit = 0, bytesMissed = 0;
      while (current != null) {
        // This assumes no ranges passed to cache to fetch have data beforehand.
        if (current.hasData()) {
          bytesHit += current.getLength();
        } else {
          if (gotAllData != null && gotAllData.value) {
            isInvalid = true;
          }
          bytesMissed += current.getLength();
        }
        current = current.next;
      }
      qfCounters.recordCacheHit(bytesHit);
      qfCounters.recordCacheMiss(bytesMissed);
    } else if (gotAllData != null && gotAllData.value) {
      DiskRangeList current = prev.next;
      while (current != null) {
        if (!current.hasData()) {
          isInvalid = true;
          break;
        }
        current = current.next;
      }
    }
    if (isInvalid) {
      StringBuilder invalidMsg = new StringBuilder(
          "Internal error - gotAllData=true but the resulting ranges are ").append(
              RecordReaderUtils.stringifyDiskRanges(prev.next));
      subCache = cache.get(fileKey);
      if (subCache != null && subCache.incRef()) {
        try {
          invalidMsg.append("; cache ranges (not necessarily consistent) are ");
          for (Map.Entry<Long, LlapDataBuffer> e : subCache.getCache().entrySet()) {
            long start = e.getKey(), end = start + e.getValue().declaredCachedLength;
            invalidMsg.append("[").append(start).append(", ").append(end).append("), ");
          }
        } finally {
          subCache.decRef();
        }
      } else {
        invalidMsg.append("; cache ranges can no longer be determined");
      }
      String s = invalidMsg.toString();
      LlapIoImpl.LOG.error(s);
      throw new RuntimeException(s);
    }
    return prev.next;
  }

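  /**
   * Marks the buffers of every file whose cache tag matches the predicate for proactive
   * eviction, optionally deallocating them right away. Returns the total number of bytes marked.
   */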
  @Override
  public long markBuffersForProactiveEviction(Predicate<CacheTag> predicate,
      boolean isInstantDeallocation) {
    long markedBytes = 0;
    // Proactive eviction does not need to be perfectly accurate - the iterator returned here might be missing some
    // concurrent inserts / removals but it's fine for us here.
    Collection<FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>>> fileCaches = cache.values();
    for (FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>> fileCache : fileCaches) {
      if (predicate.test(fileCache.getTag()) && fileCache.incRef()) {
        // Locked on subcache so that the async cleaner won't pull out buffers in an unlikely scenario.
        try {
          Collection<LlapDataBuffer> buffers = fileCache.getCache().values();
          for (LlapDataBuffer buffer : buffers) {
            markedBytes += buffer.markForEviction();
            if (isInstantDeallocation) {
              allocator.deallocate(buffer);
            }
          }
        } finally {
          fileCache.decRef();
        }
      }
    }
    return markedBytes;
  }

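  /**
   * Finds the cached buffers overlapping a single requested range and splices them into the
   * range list. Buffers that can no longer be locked are treated as evicted and removed.
   */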
  private void getOverlappingRanges(long baseOffset, DiskRangeList currentNotCached,
      ConcurrentSkipListMap<Long, LlapDataBuffer> cache, DiskRangeListFactory factory,
      BooleanRef gotAllData) {
    long absOffset = currentNotCached.getOffset() + baseOffset;
    if (!doAssumeGranularBlocks) {
      // This currently only happens in tests. See getFileData comment on the interface.
      Long prevOffset = cache.floorKey(absOffset);
      if (prevOffset != null) {
        absOffset = prevOffset;
      }
    }
    Iterator<Map.Entry<Long, LlapDataBuffer>> matches = cache.subMap(
        absOffset, currentNotCached.getEnd() + baseOffset)
        .entrySet().iterator();
    long cacheEnd = -1;
    while (matches.hasNext()) {
      assert currentNotCached != null;
      Map.Entry<Long, LlapDataBuffer> e = matches.next();
      LlapDataBuffer buffer = e.getValue();
      long requestedLength = currentNotCached.getLength();
      // Lock the buffer, validate it and add to results.
      if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
        LlapIoImpl.LOCKING_LOGGER.trace("Locking {} during get", buffer);
      }

      if (!lockBuffer(buffer, true)) {
        // If we cannot lock, remove this from cache and continue.
        matches.remove();
        if (gotAllData != null) {
          gotAllData.value = false;
        }
        continue;
      }
      long cacheOffset = e.getKey();
      if (cacheEnd > cacheOffset) { // compare with old cacheEnd
        throw new AssertionError("Cache has overlapping buffers: " + cacheEnd + ") and ["
            + cacheOffset + ", " + (cacheOffset + buffer.declaredCachedLength) + ")");
      }
      cacheEnd = cacheOffset + buffer.declaredCachedLength;
      DiskRangeList currentCached = factory.createCacheChunk(buffer,
          cacheOffset - baseOffset, cacheEnd - baseOffset);
      currentNotCached = addCachedBufferToIter(currentNotCached, currentCached, gotAllData);
      metrics.incrCacheHitBytes(Math.min(requestedLength, currentCached.getLength()));
    }
    if (currentNotCached != null) {
      assert !currentNotCached.hasData(); // Assumes no ranges passed to cache to read have data.
      if (gotAllData != null) {
        gotAllData.value = false;
      }
    }
  }

  /**
   * Adds a cached buffer to the buffer list.
   * @param currentNotCached Pointer to the list node where we are inserting.
   * @param currentCached The cached buffer found for this node, to insert.
   * @param gotAllData Set to false if part of the requested range turns out not to be cached.
   * @return The new currentNotCached pointer, following the cached buffer insertion.
   */
  private DiskRangeList addCachedBufferToIter(
      DiskRangeList currentNotCached, DiskRangeList currentCached, BooleanRef gotAllData) {
    if (currentNotCached.getOffset() >= currentCached.getOffset()) {
      // Cached buffer has the same (or lower) offset as the requested buffer.
      if (currentNotCached.getEnd() <= currentCached.getEnd()) {
        // Replace the entire current DiskRange with new cached range.
        // In case of an inexact match in either of the below it may throw. We do not currently
        // support the case where the caller requests a single cache buffer via multiple smaller
        // sub-ranges; if that happens, this may throw. No one does it now, though.
        // TODO: should we actively assert here for cache buffers larger than range?
        currentNotCached.replaceSelfWith(currentCached);
        return null;
      } else {
        // This cache range is a prefix of the requested one; the above also applies.
        // The cache may still contain the rest of the requested range, so don't set gotAllData.
        currentNotCached.insertPartBefore(currentCached);
        return currentNotCached;
      }
    }

    // Some part of the requested range is not cached - the cached offset is past the requested.
    if (gotAllData != null) {
      gotAllData.value = false;
    }
    if (currentNotCached.getEnd() <= currentCached.getEnd()) {
      // The cache buffer comprises the tail of the requested range (and possibly overshoots it).
      // The same as above applies - may throw if cache buffer is larger than the requested range,
      // and there's another range after this that starts in the middle of this cache buffer.
      // Currently, we cache at exact offsets, so the latter should never happen.
      currentNotCached.insertPartAfter(currentCached);
      return null;  // No more matches expected.
    } else {
      // The cached buffer is in the middle of the requested range.
      // The remaining tail of the latter may still be available further.
      DiskRangeList tail = new DiskRangeList(currentCached.getEnd(), currentNotCached.getEnd());
      currentNotCached.insertPartAfter(currentCached);
      currentCached.insertAfter(tail);
      return tail;
    }
  }

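  /**
   * Tries to take a refcount on the buffer; returns false if the refcount could not be taken
   * (e.g. the buffer has been evicted). Optionally notifies the policy on the first lock.
   */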
  private boolean lockBuffer(LlapDataBuffer buffer, boolean doNotifyPolicy) {
    int rc = buffer.incRef();
    if (rc > 0) {
      metrics.incrCacheNumLockedBuffers();
    }
    if (doNotifyPolicy && rc == 1) {
      // We have just locked a buffer that wasn't previously locked.
      cachePolicy.notifyLock(buffer);
    }
    return rc > 0;
  }

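  /**
   * Caches the given buffers for the given file ranges. Returns null if every buffer was cached
   * as passed in; otherwise returns a bitmask (one bit per input index, packed into longs)
   * marking the indexes where an already-cached buffer was substituted for the caller's buffer.
   */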
  @Override
  public long[] putFileData(Object fileKey, DiskRange[] ranges, MemoryBuffer[] buffers,
      long baseOffset, Priority priority, LowLevelCacheCounters qfCounters, CacheTag tag) {
    long[] result = null;
    assert buffers.length == ranges.length;
    FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>> subCache =
        FileCache.getOrAddFileSubCache(cache, fileKey, CACHE_CTOR, tag);
    try {
      for (int i = 0; i < ranges.length; ++i) {
        LlapDataBuffer buffer = (LlapDataBuffer)buffers[i];
        if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
          LlapIoImpl.LOCKING_LOGGER.trace("Locking {} at put time", buffer);
        }
        boolean canLock = lockBuffer(buffer, false);
        assert canLock;
        long offset = ranges[i].getOffset() + baseOffset;
        assert buffer.declaredCachedLength == LlapDataBuffer.UNKNOWN_CACHED_LENGTH;
        buffer.declaredCachedLength = ranges[i].getLength();
        while (true) { // Overwhelmingly executes once, or maybe twice (replacing stale value).
          LlapDataBuffer oldVal = subCache.getCache().putIfAbsent(offset, buffer);
          if (oldVal == null) {
            buffer.setStart(offset);
            buffer.setFileCache(subCache);
            // Cached successfully, add to policy.
            cachePolicy.cache(buffer, priority);
            if (qfCounters != null) {
              qfCounters.recordAllocBytes(buffer.byteBuffer.remaining(), buffer.allocSize);
            }
            break;
          }
          if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
            LlapIoImpl.CACHE_LOGGER.trace("Trying to cache when the chunk is already cached for" +
                " {}@{} (base {}); old {}, new {}", fileKey, offset, baseOffset, oldVal, buffer);
          }

          if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
            LlapIoImpl.LOCKING_LOGGER.trace("Locking {} due to cache collision", oldVal);
          }
          if (lockBuffer(oldVal, true)) {
            // We don't do proper overlap checking because it would cost cycles and we
            // think it will never happen. We do perform the most basic check here.
            if (oldVal.declaredCachedLength != buffer.declaredCachedLength) {
              throw new RuntimeException("Found a block with different length at the same offset: "
                  + oldVal.declaredCachedLength + " vs " + buffer.declaredCachedLength + " @" + offset
                  + " (base " + baseOffset + ")");
            }
            // We found an old, valid block for this key in the cache.
            if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
              LlapIoImpl.LOCKING_LOGGER.trace("Unlocking {} due to cache collision with {}",
                  buffer, oldVal);
            }

            // declaredCachedLength is always set to ranges[i].getLength() prior to inserting into the map, to
            // avoid inconsistency with the check above. However, once we have decided that this new buffer will
            // not be cached, we should unset declaredCachedLength so that it can be deallocated immediately in
            // unlockBuffer()'s else branch.
            buffer.declaredCachedLength = LlapDataBuffer.UNKNOWN_CACHED_LENGTH;
            unlockBuffer(buffer, false);
            buffers[i] = oldVal;
            if (result == null) {
              result = new long[align64(buffers.length) >>> 6];
            }
            result[i >>> 6] |= (1L << (i & 63)); // indicate that we've replaced the value
            break;
          }
          // We found some old value but couldn't incRef it; remove it.
          subCache.getCache().remove(offset, oldVal);
        }
      }
    } finally {
      subCache.decRef();
    }
    return result;
  }

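  /** Rounds up to the next multiple of 64; used to size the collision bitmask in putFileData. */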
  private static int align64(int number) {
    return ((number + 63) & ~63);
  }

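  /** Releases one refcount on the buffer; see unlockBuffer for last-unlock handling. */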
  @Override
  public void decRefBuffer(MemoryBuffer buffer) {
    unlockBuffer((LlapDataBuffer)buffer, true);
  }

  @Override
  public void decRefBuffers(List<MemoryBuffer> cacheBuffers) {
    for (MemoryBuffer b : cacheBuffers) {
      unlockBuffer((LlapDataBuffer)b, true);
    }
  }

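  /**
   * Decrements the refcount. On the last unlock, a cached buffer is handed back to the policy,
   * while a buffer that never made it into the cache is deallocated immediately.
   */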
  private void unlockBuffer(LlapDataBuffer buffer, boolean handleLastDecRef) {
    boolean isLastDecref = (buffer.decRef() == 0);
    if (handleLastDecRef && isLastDecref) {
      // This is kind of not pretty, but this is how we detect whether buffer was cached.
      // We would always set this for lookups at put time.
      if (buffer.declaredCachedLength != LlapDataBuffer.UNKNOWN_CACHED_LENGTH) {
        cachePolicy.notifyUnlock(buffer);
      } else {
        if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
          LlapIoImpl.CACHE_LOGGER.trace("Deallocating {} that was not cached", buffer);
        }
        allocator.deallocate(buffer);
      }
    }
    metrics.decrCacheNumLockedBuffers();
  }

  private static final ByteBuffer fakeBuf = ByteBuffer.wrap(new byte[1]);
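  /** Creates a dummy one-byte buffer backed by a shared array, for use as a placeholder. */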
  public static LlapDataBuffer allocateFake() {
    LlapDataBuffer fake = new LlapDataBuffer();
    fake.initialize(fakeBuf, 0, 1);
    return fake;
  }

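  // Evictions are only counted here; the counter is handed to the cleanup thread, which sweeps
  // invalid entries out of the per-file maps.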
  @Override
  public final void notifyEvicted(MemoryBuffer buffer) {
    newEvictions.incrementAndGet();
  }

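  /**
   * Background thread that sweeps invalid (evicted) buffer entries out of the per-file maps,
   * pacing each pass so that it is spread over the configured cleanup interval.
   */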
  private static final class CleanupThread
      extends FileCacheCleanupThread<ConcurrentSkipListMap<Long, LlapDataBuffer>> {

    public CleanupThread(
        ConcurrentHashMap<Object, FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>>> fileMap,
        AtomicInteger newEvictions, long cleanupInterval) {
      super("Llap low level cache cleanup thread", fileMap, newEvictions, cleanupInterval);
    }

    @Override
    protected int getCacheSize(FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>> fc) {
      return fc.getCache().size();
    }

    @Override
    public int cleanUpOneFileCache(
        FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>> fc,
        int leftToCheck, long endTime, Ref<Boolean> isPastEndTime)
        throws InterruptedException {
      // Iterate thru the file cache. This is best-effort.
      Iterator<Map.Entry<Long, LlapDataBuffer>> subIter = fc.getCache().entrySet().iterator();
      while (subIter.hasNext()) {
        long time = -1;
        isPastEndTime.value = isPastEndTime.value || ((time = System.nanoTime()) >= endTime);
        Thread.sleep(((leftToCheck <= 0) || isPastEndTime.value)
            ? 1 : (endTime - time) / (1000000L * leftToCheck));
        if (subIter.next().getValue().isInvalid()) {
          subIter.remove();
        }
        --leftToCheck;
      }
      return leftToCheck;
    }
  }

  @Override
  public boolean incRefBuffer(MemoryBuffer buffer) {
    // notifyReused implies that buffer is already locked; it's also called once for new
    // buffers that are not cached yet. Don't notify cache policy.
    return lockBuffer(((LlapDataBuffer)buffer), false);
  }

  @Override
  public Allocator getAllocator() {
    return allocator;
  }

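  /**
   * Appends a per-file and overall summary of buffer states (locked, unlocked, evicted, being
   * moved, marked for eviction) and memory usage to the supplied StringBuilder.
   */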
  @Override
  public void debugDumpShort(StringBuilder sb) {
    sb.append("\nORC cache state ");
    int allLocked = 0, allUnlocked = 0, allEvicted = 0, allMoving = 0, allMarked = 0;
    long totalUsedSpace = 0;
    for (Map.Entry<Object, FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>>> e :
        cache.entrySet()) {
      if (!e.getValue().incRef()) continue;
      try {
        int fileLocked = 0, fileUnlocked = 0, fileEvicted = 0, fileMoving = 0, fileMarked = 0;
        long fileMemoryUsage = 0;
        if (e.getValue().getCache().isEmpty()) continue;
        List<LlapDataBuffer> lockedBufs = null;
        if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
          lockedBufs = new ArrayList<>();
        }
        for (Map.Entry<Long, LlapDataBuffer> e2 : e.getValue().getCache().entrySet()) {
          int newRc = e2.getValue().tryIncRef();
          if (newRc < 0) {
            if (newRc == LlapAllocatorBuffer.INCREF_EVICTED) {
              ++fileEvicted;
            } else if (newRc == LlapAllocatorBuffer.INCREF_FAILED) {
              ++fileMoving;
            }
            continue;
          }
          if (e2.getValue().isMarkedForEviction()) {
            ++fileMarked;
          }
          try {
            if (newRc > 1) { // We hold one refcount.
              ++fileLocked;
              if (lockedBufs != null) {
                lockedBufs.add(e2.getValue());
              }
            } else {
              ++fileUnlocked;
            }
          } finally {
            fileMemoryUsage += e2.getValue().allocSize;
            e2.getValue().decRef();
          }
        }
        allLocked += fileLocked;
        allUnlocked += fileUnlocked;
        allEvicted += fileEvicted;
        allMoving += fileMoving;
        allMarked += fileMarked;
        totalUsedSpace += fileMemoryUsage;

        sb.append("\n  file "
            + e.getKey()
            + ": "
            + fileLocked
            + " locked, "
            + fileUnlocked
            + " unlocked, "
            + fileEvicted
            + " evicted, "
            + fileMoving
            + " being moved, "
            + fileMarked
            + " marked for eviction, "
            + fileMemoryUsage
            + " total used byte");
        if (fileLocked > 0 && LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
          LlapIoImpl.LOCKING_LOGGER.trace("locked-buffers: {}", lockedBufs);
        }
      } finally {
        e.getValue().decRef();
      }
    }
    sb.append("\nORC cache summary: "
        + allLocked
        + " locked, "
        + allUnlocked
        + " unlocked, "
        + allEvicted
        + " evicted, "
        + allMoving
        + " being moved, "
        + allMarked
        + " marked for eviction, "
        + totalUsedSpace
        + " total used space");
  }
}