
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.blob;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.BlobServerOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.util.FileUtils;
import org.apache.flink.util.Reference;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * Provides a cache for permanent BLOB files including a per-job ref-counting and a staged cleanup.
 *
 * <p>When requesting BLOBs via {@link #getFile(JobID, PermanentBlobKey)}, the cache will first
 * attempt to serve the file from its local cache. Only if the local cache does not contain the
 * desired BLOB, it will try to download it from a distributed HA file system (if available) or the
 * BLOB server.
 *
 * <p>If files for a job are not needed any more, they will enter a staged, i.e. deferred, cleanup.
 * Files may thus still be accessible upon recovery and do not need to be re-downloaded.
 */
public class PermanentBlobCache extends AbstractBlobCache implements JobPermanentBlobService {

    /** Job reference counters with a time-to-live (TTL). */
    @VisibleForTesting
    static class RefCount {
        /** Number of references to a job. */
        public int references = 0;

        /**
         * Timestamp in milliseconds when any job data should be cleaned up (no cleanup for
         * non-positive values).
         */
        public long keepUntil = -1;
    }

    private static final int DEFAULT_SIZE_LIMIT_MB = 100;

    /** Map to store the number of references to a specific job. */
    private final Map<JobID, RefCount> jobRefCounters = new HashMap<>();

    /** Time interval (ms) to run the cleanup task; also used as the default TTL. */
    private final long cleanupInterval;

    /** Timer task to execute the cleanup at regular intervals. */
    private final Timer cleanupTimer;

    private final BlobCacheSizeTracker blobCacheSizeTracker;

    @VisibleForTesting
    public PermanentBlobCache(
            final Configuration blobClientConfig,
            final File storageDir,
            final BlobView blobView,
            @Nullable final InetSocketAddress serverAddress)
            throws IOException {
        this(blobClientConfig, Reference.owned(storageDir), blobView, serverAddress);
    }

    @VisibleForTesting
    public PermanentBlobCache(
            final Configuration blobClientConfig,
            final File storageDir,
            final BlobView blobView,
            @Nullable final InetSocketAddress serverAddress,
            BlobCacheSizeTracker blobCacheSizeTracker)
            throws IOException {
        this(
                blobClientConfig,
                Reference.owned(storageDir),
                blobView,
                serverAddress,
                blobCacheSizeTracker);
    }

    /**
     * Instantiates a new cache for permanent BLOBs which are also available in an HA store.
     *
     * @param blobClientConfig global configuration
     * @param storageDir storage directory for the cached blobs
     * @param blobView (distributed) HA blob store file system to retrieve files from first
     * @param serverAddress address of the {@link BlobServer} to use for fetching files from or
     *     {@code null} if none yet
     * @throws IOException thrown if the (local or distributed) file storage cannot be created or is
     *     not usable
     */
    public PermanentBlobCache(
            final Configuration blobClientConfig,
            final Reference<File> storageDir,
            final BlobView blobView,
            @Nullable final InetSocketAddress serverAddress)
            throws IOException {
        this(
                blobClientConfig,
                storageDir,
                blobView,
                serverAddress,
                new BlobCacheSizeTracker(MemorySize.ofMebiBytes(DEFAULT_SIZE_LIMIT_MB).getBytes()));
    }

    @VisibleForTesting
    public PermanentBlobCache(
            final Configuration blobClientConfig,
            final Reference<File> storageDir,
            final BlobView blobView,
            @Nullable final InetSocketAddress serverAddress,
            BlobCacheSizeTracker blobCacheSizeTracker)
            throws IOException {
        super(
                blobClientConfig,
                storageDir,
                blobView,
                LoggerFactory.getLogger(PermanentBlobCache.class),
                serverAddress);

        // Initializing the clean up task
        this.cleanupTimer = new Timer(true);

        this.cleanupInterval = blobClientConfig.getLong(BlobServerOptions.CLEANUP_INTERVAL) * 1000;
        this.cleanupTimer.schedule(
                new PermanentBlobCleanupTask(), cleanupInterval, cleanupInterval);

        this.blobCacheSizeTracker = blobCacheSizeTracker;

        registerDetectedJobs();
    }

    private void registerDetectedJobs() throws IOException {
        if (storageDir.deref().exists()) {
            final Collection<JobID> jobIds =
                    BlobUtils.listExistingJobs(storageDir.deref().toPath());
            final long expiryTimeout = System.currentTimeMillis() + cleanupInterval;
            for (JobID jobId : jobIds) {
                registerJobWithExpiry(jobId, expiryTimeout);
            }
        }
    }
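    // Configuration sketch (illustrative only; "blobView" and "serverAddress" are
    // hypothetical and would be supplied by the surrounding runtime): the cleanup
    // interval is read from BlobServerOptions.CLEANUP_INTERVAL, which is given in
    // seconds and converted to milliseconds by the constructor above.
    //
    //   Configuration config = new Configuration();
    //   config.setLong(BlobServerOptions.CLEANUP_INTERVAL, 3600L); // run cleanup hourly
    //   PermanentBlobCache cache =
    //           new PermanentBlobCache(config, new File("/tmp/blob-cache"), blobView, serverAddress);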
    private void registerJobWithExpiry(JobID jobId, long expiryTimeout) {
        checkNotNull(jobId);
        synchronized (jobRefCounters) {
            final RefCount refCount =
                    jobRefCounters.computeIfAbsent(jobId, ignored -> new RefCount());
            refCount.keepUntil = expiryTimeout;
        }
    }

    /**
     * Registers use of job-related BLOBs.
     *
     * <p>Using any other method to access BLOBs, e.g. {@link #getFile}, is only valid within calls
     * to <tt>registerJob(JobID)</tt> and {@link #releaseJob(JobID)}.
     *
     * @param jobId ID of the job this blob belongs to
     * @see #releaseJob(JobID)
     */
    @Override
    public void registerJob(JobID jobId) {
        checkNotNull(jobId);

        synchronized (jobRefCounters) {
            RefCount ref = jobRefCounters.get(jobId);
            if (ref == null) {
                ref = new RefCount();
                jobRefCounters.put(jobId, ref);
            } else {
                // reset cleanup timeout
                ref.keepUntil = -1;
            }
            ++ref.references;
        }
    }

    /**
     * Unregisters use of job-related BLOBs and allows them to be released.
     *
     * @param jobId ID of the job this blob belongs to
     * @see #registerJob(JobID)
     */
    @Override
    public void releaseJob(JobID jobId) {
        checkNotNull(jobId);

        synchronized (jobRefCounters) {
            RefCount ref = jobRefCounters.get(jobId);

            if (ref == null || ref.references == 0) {
                log.warn(
                        "improper use of releaseJob() without a matching number of registerJob() calls for jobId "
                                + jobId);
                return;
            }

            --ref.references;
            if (ref.references == 0) {
                ref.keepUntil = System.currentTimeMillis() + cleanupInterval;
            }
        }
    }

    public int getNumberOfReferenceHolders(JobID jobId) {
        checkNotNull(jobId);

        synchronized (jobRefCounters) {
            RefCount ref = jobRefCounters.get(jobId);
            if (ref == null) {
                return 0;
            } else {
                return ref.references;
            }
        }
    }
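    // Minimal usage sketch (illustrative only; "cache", "jobId" and
    // "permanentBlobKey" are hypothetical and would come from the surrounding
    // runtime): file accesses are bracketed by registerJob()/releaseJob() so the
    // staged cleanup cannot remove BLOBs that are still in use.
    //
    //   cache.registerJob(jobId);
    //   try {
    //       File jarFile = cache.getFile(jobId, permanentBlobKey);
    //       // ... use the local file ...
    //   } finally {
    //       // last release sets keepUntil = now + cleanupInterval (deferred cleanup)
    //       cache.releaseJob(jobId);
    //   }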
    /**
     * Returns the path to a local copy of the file associated with the provided job ID and blob
     * key.
     *
     * <p>We will first attempt to serve the BLOB from the local storage. If the BLOB is not in
     * there, we will try to download it from the HA store, or directly from the {@link BlobServer}.
     *
     * @param jobId ID of the job this blob belongs to
     * @param key blob key associated with the requested file
     * @return The path to the file.
     * @throws java.io.FileNotFoundException if the BLOB does not exist;
     * @throws IOException if any other error occurs when retrieving the file
     */
    @Override
    public File getFile(JobID jobId, PermanentBlobKey key) throws IOException {
        checkNotNull(jobId);
        return getFileInternal(jobId, key);
    }
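    // Sketch of the practical difference (hypothetical caller code): getFile()
    // hands out a path, so a concurrent eviction or job cleanup could delete the
    // file after the call returns; readFile() below instead copies the bytes
    // while holding the cache's lock (read lock on a local hit, write lock right
    // after a download), so the returned array is always complete.
    //
    //   byte[] blobBytes = cache.readFile(jobId, permanentBlobKey); // safe copy
    //   File blobPath = cache.getFile(jobId, permanentBlobKey);     // path may be invalidated later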
    /**
     * Returns the content of the file for the BLOB with the provided job ID and the blob key.
     *
     * <p>The method will first attempt to serve the BLOB from the local cache. If the BLOB is not
     * in the cache, the method will try to download it from the HA store, or directly from the
     * {@link BlobServer}.
     *
     * <p>Compared to {@code getFile}, {@code readFile} makes sure that the file is fully read in
     * the same write lock as the file is accessed. This avoids the scenario that the path is
     * returned as the file is deleted concurrently by other threads.
     *
     * @param jobId ID of the job this blob belongs to
     * @param blobKey BLOB key associated with the requested file
     * @return The content of the BLOB.
     * @throws java.io.FileNotFoundException if the BLOB does not exist;
     * @throws IOException if any other error occurs when retrieving the file.
     */
    @Override
    public byte[] readFile(JobID jobId, PermanentBlobKey blobKey) throws IOException {
        checkNotNull(jobId);
        checkNotNull(blobKey);

        final File localFile = BlobUtils.getStorageLocation(storageDir.deref(), jobId, blobKey);
        readWriteLock.readLock().lock();

        try {
            if (localFile.exists()) {
                blobCacheSizeTracker.update(jobId, blobKey);
                return FileUtils.readAllBytes(localFile.toPath());
            }
        } finally {
            readWriteLock.readLock().unlock();
        }

        // first try the distributed blob store (if available)
        // use a temporary file (thread-safe without locking)
        File incomingFile = createTemporaryFilename();
        try {
            try {
                if (blobView.get(jobId, blobKey, incomingFile)) {
                    // now move the temp file to our local cache atomically
                    readWriteLock.writeLock().lock();
                    try {
                        checkLimitAndMoveFile(incomingFile, jobId, blobKey, localFile, log, null);
                        return FileUtils.readAllBytes(localFile.toPath());
                    } finally {
                        readWriteLock.writeLock().unlock();
                    }
                }
            } catch (Exception e) {
                log.info(
                        "Failed to copy from blob store. Downloading from BLOB server instead.", e);
            }

            final InetSocketAddress currentServerAddress = serverAddress;

            if (currentServerAddress != null) {
                // fallback: download from the BlobServer
                BlobClient.downloadFromBlobServer(
                        jobId,
                        blobKey,
                        incomingFile,
                        currentServerAddress,
                        blobClientConfig,
                        numFetchRetries);

                readWriteLock.writeLock().lock();
                try {
                    checkLimitAndMoveFile(incomingFile, jobId, blobKey, localFile, log, null);
                    return FileUtils.readAllBytes(localFile.toPath());
                } finally {
                    readWriteLock.writeLock().unlock();
                }
            } else {
                throw new IOException(
                        "Cannot download from BlobServer, because the server address is unknown.");
            }
        } finally {
            // delete incomingFile from a failed download
            if (!incomingFile.delete() && incomingFile.exists()) {
                log.warn(
                        "Could not delete the staging file {} for blob key {} and job {}.",
                        incomingFile,
                        blobKey,
                        jobId);
            }
        }
    }

    private void checkLimitAndMoveFile(
            File incomingFile,
            JobID jobId,
            BlobKey blobKey,
            File localFile,
            Logger log,
            @Nullable BlobStore blobStore)
            throws IOException {

        // Check the size limit and delete the files that exceed the limit
        final long sizeOfIncomingFile = incomingFile.length();
        final List<Tuple2<JobID, BlobKey>> blobsToDelete =
                blobCacheSizeTracker.checkLimit(sizeOfIncomingFile);

        for (Tuple2<JobID, BlobKey> key : blobsToDelete) {
            if (deleteFile(key.f0, key.f1)) {
                blobCacheSizeTracker.untrack(key);
            }
        }

        // Move the file and register it to the tracker
        BlobUtils.moveTempFileToStore(incomingFile, jobId, blobKey, localFile, log, blobStore);
        blobCacheSizeTracker.track(jobId, blobKey, localFile.length());
    }
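    // Worked eviction example (a sketch, assuming the size tracker evicts in
    // least-recently-used order and uses the default 100 MiB limit from
    // DEFAULT_SIZE_LIMIT_MB): with three 40 MiB BLOBs tracked in access order
    // b1, b2, b3, caching a new 40 MiB BLOB makes checkLimit() return [b1, b2],
    // since 120 MiB + 40 MiB exceeds the limit until 80 MiB is freed; both
    // files are deleted and untracked before the new file is moved in and tracked.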
    /**
     * Delete the blob file with the given key.
     *
     * @param jobId ID of the job this blob belongs to (or null if job-unrelated)
     * @param blobKey The key of the desired BLOB.
     * @return {@code true} if the file was deleted successfully or no longer exists; {@code false}
     *     otherwise
     */
    private boolean deleteFile(JobID jobId, BlobKey blobKey) {
        final File localFile =
                new File(
                        BlobUtils.getStorageLocationPath(
                                storageDir.deref().getAbsolutePath(), jobId, blobKey));
        if (!localFile.delete() && localFile.exists()) {
            log.warn(
                    "Failed to delete locally cached BLOB {} at {}",
                    blobKey,
                    localFile.getAbsolutePath());
            return false;
        }
        return true;
    }

    /**
     * Returns a file handle to the file associated with the given blob key on the blob server.
     *
     * @param jobId ID of the job this blob belongs to (or null if job-unrelated)
     * @param key identifying the file
     * @return file handle to the file
     * @throws IOException if creating the directory fails
     */
    @VisibleForTesting
    public File getStorageLocation(JobID jobId, BlobKey key) throws IOException {
        checkNotNull(jobId);
        return BlobUtils.getStorageLocation(storageDir.deref(), jobId, key);
    }

    /**
     * Returns the job reference counters - for testing purposes only!
     *
     * @return job reference counters (internal state!)
     */
    @VisibleForTesting
    Map<JobID, RefCount> getJobRefCounters() {
        return jobRefCounters;
    }

    /**
     * Cleanup task which is executed periodically to delete BLOBs whose ref-counter reached
     * <tt>0</tt>.
     */
    class PermanentBlobCleanupTask extends TimerTask {
        /** Cleans up BLOBs which are not referenced anymore. */
        @Override
        public void run() {
            synchronized (jobRefCounters) {
                Iterator<Map.Entry<JobID, RefCount>> entryIter =
                        jobRefCounters.entrySet().iterator();
                final long currentTimeMillis = System.currentTimeMillis();

                while (entryIter.hasNext()) {
                    Map.Entry<JobID, RefCount> entry = entryIter.next();
                    RefCount ref = entry.getValue();

                    if (ref.references <= 0
                            && ref.keepUntil > 0
                            && currentTimeMillis >= ref.keepUntil) {
                        JobID jobId = entry.getKey();
                        final File localFile =
                                new File(
                                        BlobUtils.getStorageLocationPath(
                                                storageDir.deref().getAbsolutePath(), jobId));

                        /*
                         * NOTE: normally it is not required to acquire the write lock to delete the job's
                         * storage directory since there should be no one accessing it with the ref
                         * counter being 0 - acquire it just in case, to always be on the safe side
                         */
                        readWriteLock.writeLock().lock();

                        boolean success = false;
                        try {
                            blobCacheSizeTracker.untrackAll(jobId);
                            FileUtils.deleteDirectory(localFile);
                            success = true;
                        } catch (Throwable t) {
                            log.warn(
                                    "Failed to locally delete job directory "
                                            + localFile.getAbsolutePath(),
                                    t);
                        } finally {
                            readWriteLock.writeLock().unlock();
                        }

                        // let's only remove this directory from cleanup if the cleanup was
                        // successful (does not need the write lock)
                        if (success) {
                            entryIter.remove();
                        }
                    }
                }
            }
        }
    }

    @Override
    protected void cancelCleanupTask() {
        cleanupTimer.cancel();
    }
}
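// Staged-cleanup timeline (a sketch derived from the code above, assuming
// BlobServerOptions.CLEANUP_INTERVAL is set to 3600 seconds): cleanupInterval
// becomes 3,600,000 ms. When releaseJob() drops a job's reference count to 0,
// keepUntil is set to now + 3,600,000 ms, and PermanentBlobCleanupTask fires
// every 3,600,000 ms, so the job directory is deleted between one and two
// intervals after the last release - unless registerJob() runs again first,
// which resets keepUntil to -1 and keeps the files alive.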




