All Downloads are FREE. Search and download functionality uses the official Maven repository.

alluxio.underfs.UfsInputStreamCache Maven / Gradle / Ivy

There is a newer version: 313
Show newest version
/*
 * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
 * (the "License"). You may not use this work except in compliance with the License, which is
 * available at www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied, as more fully set forth in the License.
 *
 * See the NOTICE file distributed with this work for information regarding copyright ownership.
 */

package alluxio.underfs;

import alluxio.Constants;
import alluxio.conf.Configuration;
import alluxio.conf.PropertyKey;
import alluxio.exception.runtime.AlluxioRuntimeException;
import alluxio.file.FileId;
import alluxio.underfs.options.OpenOptions;
import alluxio.util.IdUtils;
import alluxio.util.executor.ExecutorServiceFactories;
import alluxio.util.logging.SamplingLogger;
import alluxio.worker.block.io.BlockReader;

import com.google.common.base.Preconditions;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalListeners;
import com.google.common.cache.RemovalNotification;
import com.google.common.util.concurrent.UncheckedExecutionException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import javax.annotation.concurrent.ThreadSafe;

/**
 * This class implements a {@link BlockReader} to read a block directly from UFS, and
 * optionally cache the block to the Alluxio worker if the whole block it is read.
 */
@ThreadSafe
public final class UfsInputStreamCache {
  private static final Logger LOG = LoggerFactory.getLogger(UfsInputStreamCache.class);
  private static final Logger SAMPLING_LOG = new SamplingLogger(LOG, 10L * Constants.MINUTE_MS);
  private static final boolean CACHE_ENABLED =
      Configuration.getBoolean(PropertyKey.WORKER_UFS_INSTREAM_CACHE_ENABLED);

  /**
   * A map from the ufs file id to the metadata of the input streams. Synchronization on this map
   * before access.
   */
  private final Map mFileIdToStreamIds;
  /** Cache of the input streams, from the input stream id to the input stream. */
  private final Cache mStreamCache;

  /**
   * Constructs a new UFS input stream cache.
   */
  public UfsInputStreamCache() {
    mFileIdToStreamIds = new ConcurrentHashMap<>();
    /* Thread pool for asynchronously removing the expired input streams. */
    ExecutorService removalThreadPool =
        ExecutorServiceFactories.fixedThreadPool(Constants.UFS_INPUT_STREAM_CACHE_EXPIRATION, 2)
            .create();

    // A listener to the input stream removal.
    RemovalListener listener =
        (RemovalNotification removal) -> {
          CachedSeekableInputStream inputStream = removal.getValue();
          final FileId fileId = inputStream.getFileId();
          final long resourceId = removal.getKey();
          boolean shouldClose = false;

          StreamIdSet streamIds = mFileIdToStreamIds.get(fileId);
          if (streamIds == null) {
            LOG.warn(
                "Removed UFS input stream (fileId: {} resourceId: {}) but does not exist",
                fileId, resourceId);
          } else {
            synchronized (streamIds) {
              // remove the key
              if (streamIds.removeInUse(resourceId)) {
                LOG.warn("Removed in-use UFS input stream (fileId: {} resourceId: {})", fileId,
                    resourceId);
              }
              if (streamIds.removeAvailable(resourceId)) {
                // close the resource
                LOG.debug("Removed available UFS input stream (fileId: {} resourceId: {})", fileId,
                    resourceId);
                shouldClose = true;
              }
              if (streamIds.isEmpty()) {
                // remove the value from the mapping
                mFileIdToStreamIds.remove(fileId);
              }
            }
          }

          if (shouldClose) {
            try {
              inputStream.close();
            } catch (IOException e) {
              LOG.warn("Failed to close UFS input stream resource of file {} with file id {}"
                      + " and resource id {}", inputStream.getFilePath(), inputStream.getFileId(),
                  resourceId);
            }
          }
        };
    mStreamCache = CacheBuilder.newBuilder()
        .maximumSize(Configuration.getInt(PropertyKey.WORKER_UFS_INSTREAM_CACHE_MAX_SIZE))
        .expireAfterAccess(
            Configuration.getMs(PropertyKey.WORKER_UFS_INSTREAM_CACHE_EXPIRARTION_TIME),
            TimeUnit.MILLISECONDS)
        .removalListener(RemovalListeners.asynchronous(listener, removalThreadPool)).build();
  }

  /**
   * Releases an input stream. The input stream is closed if it's already expired.
   *
   * @param inputStream the input stream to release
   * @throws IOException when input stream fails to close
   */
  public void release(InputStream inputStream) {
    if (!(inputStream instanceof CachedSeekableInputStream) || !CACHE_ENABLED) {
      // for non-seekable input stream, close and return
      close(inputStream);
      return;
    }

    CachedSeekableInputStream cachedStream = (CachedSeekableInputStream) inputStream;
    FileId fileId = cachedStream.getFileId();
    long resourceId = cachedStream.getResourceId();
    StreamIdSet streamIds = mFileIdToStreamIds.get(fileId);

    if (streamIds == null) {
      // the cache no longer tracks this input stream
      LOG.debug("UFS input stream (fileId: {} resourceId: {}) is already expired", fileId,
          resourceId);
      close(inputStream);
      return;
    }

    if (!streamIds.release(resourceId)) {
      LOG.debug("Close the expired UFS input stream (fileId: {} resourceId: {})", fileId,
          resourceId);
      // the input stream expired, close it
      close(inputStream);
    }
  }

  private void close(InputStream in) {
    try {
      in.close();
    } catch (IOException e) {
      throw AlluxioRuntimeException.from(e);
    }
  }

  /**
   * Acquires an input stream. For seekable input streams, if there is an available input stream in
   * the cache, reuse it and repositions the offset, otherwise the manager opens a new input stream.
   *
   * @param ufs the under file system
   * @param path the path to the under storage file
   * @param fileId the file id
   * @param openOptions the open options
   * @return the acquired input stream
   * @throws IOException if the input stream fails to open
   */
  public InputStream acquire(UnderFileSystem ufs, String path, FileId fileId,
      OpenOptions openOptions) throws IOException {
    if (!ufs.isSeekable() || !CACHE_ENABLED) {
      // only seekable UFSes are cachable/reusable, always return a new input stream
      return ufs.openExistingFile(path, openOptions);
    }

    // explicit cache cleanup
    try {
      mStreamCache.cleanUp();
    } catch (Throwable error) {
      SAMPLING_LOG.warn("Explicit cache removal failed.", error);
    }

    StreamIdSet streamIds = mFileIdToStreamIds.compute(fileId, (key, value) -> {
      if (value != null) {
        return value;
      }
      return new StreamIdSet();
    });

    // Try to acquire an existing id from the stream id set.
    // synchronized is required to be consistent between availableIds() and acquire(id).
    CachedSeekableInputStream inputStream = null;
    synchronized (streamIds) {
      // find the next available input stream from the cache
      for (long id : streamIds.availableIds()) {
        inputStream = mStreamCache.getIfPresent(id);
        if (inputStream != null) {
          // acquire it now while locked, so other threads cannot take it
          streamIds.acquire(id);
          break;
        }
      }
    }

    if (inputStream != null) {
      // for the cached ufs instream, seek (outside of critical section) to the requested position.
      LOG.debug("Reused the under file input stream resource of {}", inputStream.getResourceId());
      inputStream.seek(openOptions.getOffset());
      return inputStream;
    }

    // no cached input stream is available, acquire a new id and open a new stream
    final long newId = streamIds.acquireNewId();
    try {
      inputStream = mStreamCache.get(newId, () -> {
        SeekableUnderFileInputStream ufsStream = (SeekableUnderFileInputStream) ufs
            .openExistingFile(path,
                OpenOptions.defaults().setPositionShort(openOptions.getPositionShort())
                    .setOffset(openOptions.getOffset()));
        LOG.debug("Created the under file input stream resource of {}", newId);
        return new CachedSeekableInputStream(ufsStream, newId, fileId, path);
      });
    } catch (ExecutionException e) {
      LOG.warn("Failed to create a new cached ufs instream of file id {} and path {}", fileId,
          path, e);
      // fall back to an uncached ufs creation.
      return ufs.openExistingFile(path,
          OpenOptions.defaults().setOffset(openOptions.getOffset()));
    }
    catch (UncheckedExecutionException e) {
      throw AlluxioRuntimeException.from(e.getCause());
    }
    return inputStream;
  }

  /**
   * The metadata of the input streams associated with an under storage file that tracks which input
   * streams are in-use or available. Each input stream is identified by a unique id.
   */
  @ThreadSafe
  private static class StreamIdSet {
    private final Set mInUseStreamIds;
    private final Set mAvailableStreamIds;

    /**
     * Creates a new {@link StreamIdSet}.
     */
    StreamIdSet() {
      mInUseStreamIds = new HashSet<>();
      mAvailableStreamIds = new HashSet<>();
    }

    /**
     * @return a view of the available input stream ids
     */
    synchronized Set availableIds() {
      return Collections.unmodifiableSet(mAvailableStreamIds);
    }

    /**
     * Marks an input stream as acquired.
     *
     * @param id the id of the input stream
     */
    synchronized void acquire(long id) {
      Preconditions.checkArgument(!mInUseStreamIds.contains(id), "%s is already in use", id);
      mAvailableStreamIds.remove(id);
      mInUseStreamIds.add(id);
    }

    synchronized long acquireNewId() {
      while (true) {
        long newId = IdUtils.getRandomNonNegativeLong();
        if (mAvailableStreamIds.contains(newId) || mInUseStreamIds.contains(newId)) {
          // This id already managed, try again.
          continue;
        }

        acquire(newId);
        return newId;
      }
    }

    /**
     * @return if there is any outstanding input streams of the file
     */
    synchronized boolean isEmpty() {
      return mInUseStreamIds.isEmpty() && mAvailableStreamIds.isEmpty();
    }

    /**
     * Marks an input stream as not in use.
     * @param id the id of the input stream
     * @return true if removed from the in-use streams
     */
    synchronized boolean removeInUse(long id) {
      return mInUseStreamIds.remove(id);
    }

    /**
     * Removes the mark of the input stream as available.
     *
     * @param id the id of the input stream
     * @return true if the given input stream is available, false if the
     *         given input stream is not among available input streams
     */
    synchronized boolean removeAvailable(long id) {
      return mAvailableStreamIds.remove(id);
    }

    /**
     * Returns an id to the input stream pool. It marks the id from in use to available. If the
     * input stream is already removed from the cache, then do nothing.
     *
     * @param id id of the input stream
     * @return true if the id is marked from in use to available; false if the id no longer used for
     *         cache
     */
    synchronized boolean release(long id) {
      Preconditions.checkArgument(!mAvailableStreamIds.contains(id));
      if (mInUseStreamIds.contains(id)) {
        mInUseStreamIds.remove(id);
        mAvailableStreamIds.add(id);
        return true;
      }
      return false;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy