/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine.filecache;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.runtime.state.gemini.engine.GRegionContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.filecompaction.FileCompactionPageTransfer;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileReader;
import org.apache.flink.runtime.state.gemini.engine.fs.FileWriter;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddress;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddressSingleImpl;
import org.apache.flink.runtime.state.gemini.engine.rm.GByteBuffer;
import org.apache.flink.util.Preconditions;

import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiConsumer;

/**
 * An implementation of {@link FileCache} with infinite capacity. Pages added
 * via {@link #addPage} will never be flushed to the destination storage.
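 *
 * <p>A minimal usage sketch (illustrative only; in practice the engine creates and
 * wires up the cache itself, and the context, file managers and executor below are
 * assumed to be provided by the surrounding GeminiDB instance):
 * <pre>{@code
 * FileCache cache = new InfiniteFileCache(gContext, localFileManager, dfsFileManager);
 * // asynchronously write the page to a local file; the callback gets (success, failureCause)
 * cache.addPage(pageAddress, regionContext, flushExecutor, (success, failure) -> {});
 * // read it back, preferring the local copy and falling back to dfs
 * GByteBuffer page = cache.getPage(pageAddress, regionContext, flushExecutor);
 * }</pre>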
 */
public class InfiniteFileCache extends FileCache implements FileCompactionPageTransfer {

	private static final Logger LOG = LoggerFactory.getLogger(InfiniteFileCache.class);

	private final GContext gContext;

	private final long maxFileSize;

	private final boolean syncWhenBatchFlush;

	/**
	 * File manager for cache.
	 */
	private final FileManager localFileManager;

	/**
	 * Each event executor has its own file writer.
	 */
	private Map<EventExecutor, FileWriter> localFileWriters;

	/**
	 * File manager for destination storage.
	 */
	private final FileManager dfsFileManager;

	/**
	 * Each event executor has its own file writer.
	 */
	private Map<EventExecutor, FileWriter> dfsFileWriters;

	private volatile boolean closed;

	public InfiniteFileCache(GContext gContext, FileManager localFileManager, FileManager dfsFileManager) {
		super(Long.MAX_VALUE, new FileCacheStat());

		this.gContext = Preconditions.checkNotNull(gContext);
		this.maxFileSize = gContext.getGConfiguration().getMaxFileSize();
		Preconditions.checkArgument(maxFileSize > 0, "Max file size should be positive");
		this.syncWhenBatchFlush = gContext.getGConfiguration().isSnapshotSyncWhenBatchFlush();

		this.localFileManager = Preconditions.checkNotNull(localFileManager);
		this.dfsFileManager = Preconditions.checkNotNull(dfsFileManager);

		this.localFileWriters = new ConcurrentHashMap<>();
		this.dfsFileWriters = new ConcurrentHashMap<>();

		FileCacheMetrics fileCacheMetrics = gContext.getFileCacheMetrics();
		if (fileCacheMetrics != null) {
			fileCacheMetrics.register(fileCacheStat);
		}

		this.closed = false;

		LOG.info("InfiniteFileCache created, LocalFileManager {}, DfsFileManager {}", localFileManager, dfsFileManager);
	}

	@VisibleForTesting
	public FileManager getLocalFileManager() {
		return localFileManager;
	}

	@VisibleForTesting
	public FileManager getDfsFileManager() {
		return dfsFileManager;
	}

	// implementation for file cache =======================================================

	@Override
	public boolean isCached(PageAddress pageAddress) {
		return pageAddress.isLocalValid();
	}

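	/**
	 * Adds a page to the local file cache. If the page is not yet valid locally, it is
	 * written to a local file asynchronously on the given flush executor; if its bytes
	 * are no longer in memory, they are first read back from dfs. The optional callback
	 * is invoked with the result.
	 */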
	@Override
	public void addPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor flushEventExecutor,
		BiConsumer<Boolean, Throwable> callBack) {
		Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
		if (pageAddress.isLocalValid()) {
			if (callBack != null) {
				callBack.accept(true, null);
			}
			return;
		}

		flushEventExecutor.execute(() -> {
			boolean success = false;
			Throwable throwable = null;
			GByteBuffer gByteBuffer = pageAddress.getGByteBufferWithReference();
			try {
				if (!pageAddress.isLocalValid()) {
					if (gByteBuffer == null && pageAddress.isDfsValid()) {
						// for the infinite cache, this branch is taken in the following case:
						// 1. local recovery is enabled
						// 2. the job was restored from a checkpoint located on the dfs
						// 3. a local snapshot is started, but some pages are neither in memory nor on local disk
						gByteBuffer = getGByteBuffer(dfsFileManager,
							pageAddress::getDfsAddress,
							pageAddress,
							false);
					}
					if (gByteBuffer != null) {
						FileWriter fileWriter = getOrCreateFileWriter(localFileWriters,
							localFileManager,
							flushEventExecutor);
						internalAddPage(localFileManager,
							fileWriter,
							pageAddress,
							gByteBuffer,
							gRegionContext,
							true,
							true);
					} else {
						throw new GeminiRuntimeException("data page does not exist");
					}
				}
				success = true;
			} catch (Exception e) {
				success = false;
				throwable = e;
				LOG.error("error when adding page to cache", e);
			} finally {
				if (gByteBuffer != null) {
					gByteBuffer.release();
				}
				if (callBack != null) {
					callBack.accept(success, throwable);
				}
			}
		});
	}

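	/**
	 * Reads a page, preferring the local file cache. On a local miss the page is read
	 * from dfs and asynchronously cached to a local file. Hit and miss sizes are
	 * recorded in the file cache statistics.
	 */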
	@Override
	public GByteBuffer getPage(
		PageAddress pageAddress, GRegionContext gRegionContext, EventExecutor flushEventExecutor) {
		GByteBuffer gByteBuffer = null;
		try {
			Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
			if (pageAddress.isLocalValid()) {
				gByteBuffer = getGByteBuffer(localFileManager,
					pageAddress::getLocalAddress,
					pageAddress,
					true);
				if (gByteBuffer != null) {
					fileCacheStat.addHitSize(pageAddress.getDataLen());
				}
			} else if (pageAddress.isDfsValid()) {
				gByteBuffer = getGByteBuffer(dfsFileManager, pageAddress::getDfsAddress, pageAddress, false);
				if (gByteBuffer != null) {
					cachePage(pageAddress, gByteBuffer, flushEventExecutor, gRegionContext);
					fileCacheStat.addMissSize(pageAddress.getDataLen());
				}
			}
			Preconditions.checkNotNull(gByteBuffer, "no page exists on local and dfs");
			return gByteBuffer;
		} catch (Exception e) {
			if (gByteBuffer != null) {
				gByteBuffer.release();
			}
			LOG.error("exception when getting page", e);
			throw new GeminiRuntimeException("exception when getting page: " + e.getMessage(), e);
		}
	}

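	/**
	 * Marks the page as invalid and decrements the reference counts of the underlying
	 * local and dfs files, which may eventually allow those files to be deleted.
	 */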
	@Override
	public void discardPage(
		PageAddress pageAddress, GRegionContext gRegionContext, EventExecutor eventExecutor) {
		Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
		boolean pageValid;
		boolean localValid = false;
		long localAddress = -1;
		boolean dfsValid = false;
		long dfsAddress = -1;
		synchronized (pageAddress) {
			pageValid = pageAddress.isPageValid();
			if (pageValid) {
				pageAddress.setPageStatus(false);
				localValid = pageAddress.isLocalValid();
				dfsValid = pageAddress.isDfsValid();
				if (localValid) {
					localAddress = pageAddress.getLocalAddress();
				}
				if (dfsValid) {
					dfsAddress = pageAddress.getDfsAddress();
				}
			}
		}
		if (pageValid) {
			long accessNumber = gContext.getAccessNumber();
			long ts = System.currentTimeMillis();
			// decrementing the reference may lead to file deletion, but the file may still be in use:
			// 1. read by the main thread
			//    In this case, we use the access number to delay the deletion.
			// 2. read by split or compaction
			//    Generally discardPage is called in the region thread, which ensures the page
			//    will not be used by split or compaction after it is discarded.
			if (localValid) {
				localFileManager.decDBReference(localAddress, accessNumber, ts, pageAddress.getDataLen());
			}
			if (dfsValid) {
				dfsFileManager.decDBReference(dfsAddress, accessNumber, ts, pageAddress.getDataLen());
			}
		}
	}

	/**
	 * FIXME This method doesn't guarantee that data can be read immediately
	 * after the address is updated; use {@link #flushBatchPages} instead.
	 */
	@Override
	public void flushPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor eventExecutor,
		boolean force,
		BiConsumer<Boolean, Throwable> callBack) {
		Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
		if (!force && pageAddress.isDfsValid()) {
			if (callBack != null) {
				callBack.accept(true, null);
			}
			return;
		}

		eventExecutor.execute(() -> {
			boolean success = false;
			Throwable throwable = null;
			boolean pageIsNull = false;
			GByteBuffer gByteBuffer = pageAddress.getGByteBufferWithReference();
			try {
				// recheck whether to flush
				if (force || !pageAddress.isDfsValid()) {
					if (gByteBuffer == null) {
						pageIsNull = true;
						if (pageAddress.isLocalValid()) {
							gByteBuffer = getGByteBuffer(localFileManager,
								pageAddress::getLocalAddress,
								pageAddress,
								true);
						} else if (pageAddress.isDfsValid()) {
							// this may happen when taking a snapshot for the first time after rescaling
							gByteBuffer = getGByteBuffer(dfsFileManager,
								pageAddress::getDfsAddress,
								pageAddress,
								false);

							// TODO the page is not cached locally here because the eventExecutor used here is
							// usually the snapshotEventExecutor, not the flushEventExecutor. The snapshot
							// executor will not be used in normal flushes, so files created by this
							// executor will not be written to after this caching and can't be closed
							// (the file may not be full), which means the file resources can't be released.
							// So we don't cache the data here for now, and rely on file download when
							// restoring.
							// cachePage(pageAddress, gByteBuffer, eventExecutor, gRegionContext);
						}
					}
					Preconditions.checkNotNull(gByteBuffer, "Data page is null");
					FileWriter fileWriter = getOrCreateFileWriter(dfsFileWriters, dfsFileManager, eventExecutor);
					internalAddPage(dfsFileManager, fileWriter, pageAddress, gByteBuffer,
						gRegionContext, false, false);
				}
				success = true;
			} catch (Exception e) {
				success = false;
				throwable = e;
				LOG.error("error when flushing page to dfs: pageIsNull={}, {}", pageIsNull, e.getMessage(), e);
			} finally {
				if (gByteBuffer != null) {
					gByteBuffer.release();
				}
				if (callBack != null) {
					callBack.accept(success, throwable);
				}
			}
		});
	}

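	/**
	 * Writes a batch of pages to the local file cache on the given executor. Pages that
	 * are already valid locally are skipped. The writer is flushed once for the whole
	 * batch before the local addresses are updated, and all callbacks are invoked with
	 * the batch result.
	 */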
	@Override
	public void addBatchPages(
		List<PageAddress> pages,
		List<GRegionContext> gRegionContexts,
		EventExecutor eventExecutor,
		List<BiConsumer<Boolean, Throwable>> callBacks) {

		if (pages.isEmpty()) {
			return;
		}

		eventExecutor.execute(() -> {
			boolean success = true;
			Throwable throwable = null;
			try {
				int size = pages.size();
				// TODO reuse list
				List<Long> addressList = new ArrayList<>(size);
				FileWriter fileWriter = getOrCreateFileWriter(localFileWriters, localFileManager, eventExecutor);
				for (int i = 0; i < size; i++) {
					PageAddress page = pages.get(i);
					if (!page.isLocalValid()) {
						GRegionContext gRegionContext = gRegionContexts.get(i);
						GByteBuffer buffer = page.getGByteBufferWithReference();
						try {
							if (buffer == null && page.isDfsValid()) {
								// the page is not in memory, so read it from dfs
								buffer = getGByteBuffer(dfsFileManager, page::getDfsAddress, page, false);
							}
							if (buffer != null) {
								// write page to local
								long address = writePage(localFileManager,
									fileWriter,
									page,
									buffer,
									gRegionContext,
									true);
								addressList.add(address);
							} else {
								throw new GeminiRuntimeException("data page does not exist");
							}
						} finally {
							// release buffer as soon as possible
							if (buffer != null) {
								buffer.release();
							}
						}
					} else {
						// a null address indicates there is no need to update page
						addressList.add(null);
					}
				}
				// flush to ensure data can be read immediately after addresses are updated
				fileWriter.flush();

				long accessNumber = gRegionContexts.get(0).getGContext().getAccessNumber();
				// update file references; no exception should happen here
				for (int i = 0; i < size; i++) {
					Long address = addressList.get(i);
					// skip pages that do not have a new address
					if (address != null) {
						updatePageAddress(localFileManager, pages.get(i), address, true, accessNumber);
					}
				}
			} catch (Exception e) {
				success = false;
				throwable = new AddBatchPageException(e);
			} finally {
				// execute callbacks for all pages
				for (BiConsumer<Boolean, Throwable> callBack : callBacks) {
					if (callBack != null) {
						callBack.accept(success, throwable);
					}
				}
			}
		});
	}

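	/**
	 * Flushes a batch of pages to dfs on the given executor. Pages that are already valid
	 * on dfs are skipped unless {@code force} is set. Pages that are not in memory and not
	 * valid locally may additionally be written to a local file when {@code flushLocal} is
	 * set. The writer is synced or flushed once for the whole batch before the dfs
	 * addresses are updated, and all callbacks are invoked with the batch result.
	 */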
	@Override
	public void flushBatchPages(
		List<PageAddress> pages,
		List<GRegionContext> gRegionContexts,
		EventExecutor eventExecutor,
		boolean force,
		boolean flushLocal,
		List<BiConsumer<Boolean, Throwable>> callBacks) {

		if (pages.isEmpty()) {
			return;
		}
		// TODO refactor code to remove duplication with addBatchPages
		eventExecutor.execute(() -> {
			boolean success = true;
			Throwable throwable = null;
			try {
				int size = pages.size();
				// TODO reuse list
				List<Long> addressList = new ArrayList<>(size);
				FileWriter fileWriter = getOrCreateFileWriter(dfsFileWriters, dfsFileManager, eventExecutor);
				for (int i = 0; i < size; i++) {
					PageAddress page = pages.get(i);
					if (force || !page.isDfsValid()) {
						GByteBuffer buffer = page.getGByteBufferWithReference();
						boolean inMemory = buffer != null;
						try {
							if (!inMemory) {
								// read page from local or dfs
								if (page.isLocalValid()) {
									buffer = getGByteBuffer(localFileManager, page::getLocalAddress, page, true);
								} else if (page.isDfsValid()) {
									buffer = getGByteBuffer(dfsFileManager, page::getDfsAddress, page, false);
								}
							}
							if (buffer != null) {
								// write page to dfs
								long address = writePage(dfsFileManager,
									fileWriter,
									page,
									buffer,
									gRegionContexts.get(i),
									false);
								addressList.add(address);

								// TODO to fix EOF, we always flush pages to local if they do not exist there, and
								// we check this after writing to dfs because we hope the local snapshot has already
								// done it, in which case there is no need to flush again if local recovery is enabled.
								// If inMemory is true, evict/discard will ensure that local is valid before
								// setting the data page to null, so there is no need to flush to local here.
								if (!inMemory && flushLocal && !page.isLocalValid()) {
									FileWriter localFileWriter = getOrCreateFileWriter(localFileWriters,
										localFileManager,
										eventExecutor);
									// this will call outputStream.flush and update local address
									internalAddPage(localFileManager,
										localFileWriter,
										page,
										buffer,
										gRegionContexts.get(i),
										true,
										true);
								}
							} else {
								throw new GeminiRuntimeException("data page does not exist");
							}
						} finally {
							// release buffer as soon as possible
							if (buffer != null) {
								buffer.release();
							}
						}
					} else {
						// a null address indicates there is no need to update page
						addressList.add(null);
					}
				}

				// flush to ensure data can be read immediately after addresses are updated
				if (syncWhenBatchFlush) {
					fileWriter.sync();
				} else {
					fileWriter.flush();
				}
				long accessNumber = gRegionContexts.get(0).getGContext().getAccessNumber();
				// update file references; no exception should happen here
				for (int i = 0; i < size; i++) {
					Long address = addressList.get(i);
					// skip pages that do not have a new address
					if (address != null) {
						updatePageAddress(dfsFileManager, pages.get(i), address, false, accessNumber);
					}
				}
			} catch (Exception e) {
				success = false;
				throwable = new FlushBatchPageException(e);
			} finally {
				// execute callbacks for all pages
				for (BiConsumer<Boolean, Throwable> callBack : callBacks) {
					if (callBack != null) {
						callBack.accept(success, throwable);
					}
				}
			}
		});
	}

	/**
	 * Currently this is only used to sync data when a snapshot is finished.
	 */
	@Override
	public void sync() throws IOException {
		// sync dfs data
		for (FileWriter fileWriter : dfsFileWriters.values()) {
			fileWriter.sync();
		}

		// for local snapshot, it's better to sync data for all writers
		for (FileWriter fileWriter : localFileWriters.values()) {
			fileWriter.sync();
		}
	}

	@Override
	public FileCacheType getFileCacheType() {
		return FileCacheType.INFINITE;
	}

	// implementation for page transfer =======================================================

	@Override
	public FileManager getDbFileManager() {
		return localFileManager;
	}

	@Override
	public boolean hasDbFileAddress(PageAddress pageAddress) {
		return pageAddress.isPageValid() && pageAddress.isLocalValid();
	}

	@Override
	public int getDbFileId(PageAddress pageAddress) {
		return localFileManager.getSimpleFileID(pageAddress.getLocalAddress());
	}

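	/**
	 * Rewrites a page to a new local file during file compaction. The page bytes are taken
	 * from memory if available, otherwise read from the current local file, and then
	 * written via the executor's local file writer.
	 */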
	@Override
	public void transferPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor eventExecutor,
		@Nullable BiConsumer<Boolean, Throwable> callBack) {

		// FIXME file compaction is not started from the region executor,
		// so it's useless to get the byte buffer outside of the executor. Do not
		// enable file compaction in off-heap mode.
		eventExecutor.execute(() -> {
			boolean success = false;
			Throwable throwable = null;
			GByteBuffer buffer = pageAddress.getGByteBufferWithReference();

			try {
				// TODO there is some work to complete
				// 1. the mechanism for data page references in off-heap mode is not suitable
				// in this case, and this is expected to be solved in [BLINK-21500417], so
				// currently file compaction can only be enabled in on-heap mode
				// 2. if the page is not in memory, we will load it, build a DataPage and write it
				// again. But actually there is no need to build a DataPage, and the input
				// byte stream could be output directly. If compression is enabled, we need to do
				// a bit more work, so we will do it after rebasing the compression code

				// recheck whether the local address is valid
				if (pageAddress.isPageValid() && pageAddress.isLocalValid()) {
					if (buffer == null) {
						long localAddress = pageAddress.getLocalAddress();
						FileReader fileReader = localFileManager.getFileReader(localAddress);
						long offset = localFileManager.getFileOffset(localAddress);
						buffer = localFileManager.getDataPageUtil().getDataPageFromReader(
							fileReader,
							(int) offset,
							pageAddress);
					}
					FileWriter fileWriter = getOrCreateFileWriter(localFileWriters,
						localFileManager, eventExecutor);
					internalAddPage(localFileManager, fileWriter, pageAddress, buffer,
						gRegionContext, true, true);
					success = true;
				}
			} catch (Exception e) {
				throwable = e;
			} finally {
				if (buffer != null) {
					buffer.release();
				}

				if (callBack != null) {
					callBack.accept(success, throwable);
				}
			}
		});
	}

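	/**
	 * Closes the cache. Subsequent calls are no-ops. All local and dfs file writers are
	 * closed; the DB must guarantee that no writes happen after close is called.
	 */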
	@Override
	public void close() throws IOException {
		synchronized (this) {
			if (closed) {
				LOG.warn("InfiniteFileCache has been closed");
				return;
			}
			closed = true;
		}

		// DB should guarantee write will not happen after close is called.
		for (FileWriter fileWriter : localFileWriters.values()) {
			localFileManager.closeFileWriter(fileWriter);
		}
		localFileWriters.clear();

		for (FileWriter fileWriter : dfsFileWriters.values()) {
			dfsFileManager.closeFileWriter(fileWriter);
		}
		dfsFileWriters.clear();
		LOG.info("InfiniteFileCache is closed");
	}

	@VisibleForTesting
	Map<EventExecutor, FileWriter> getDfsFileWriters() {
		return dfsFileWriters;
	}

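	/**
	 * Reads the page bytes via the given file manager, re-reading the (possibly replaced)
	 * address and retrying on failure. Failures with an unchanged address count as
	 * unexpected tries, failures after the address changed count as expected tries; the
	 * read is abandoned after 3 unexpected or 10 expected tries.
	 */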
	private GByteBuffer getGByteBuffer(
		FileManager fileManager,
		Callable<Long> addressCallable,
		PageAddress pageAddress,
		boolean isLocal) throws Exception {
		Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
		int unexpectedTries = 0;
		int expectedTries = 0;
		GByteBuffer gByteBuffer = null;
		long address = addressCallable.call();
		while (true) {
			try {
				FileReader fileReader = fileManager.getFileReader(address);
				long offset = fileManager.getFileOffset(address);
				long startTime = System.nanoTime();
				gByteBuffer = fileManager.getDataPageUtil().getDataPageFromReader(fileReader,
					(int) offset,
					pageAddress);
				updateReadStat(pageAddress.getDataLen(), System.nanoTime() - startTime, isLocal);
				return gByteBuffer;
			} catch (Exception e) {
				if (gByteBuffer != null) {
					gByteBuffer.release();
				}
				gByteBuffer = null;
				long oldAddress = address;
				// the address may be replaced concurrently, so re-read it on every retry
				address = addressCallable.call();
				// an unchanged address means an unexpected failure; a changed address means an expected retry
				if (oldAddress == address) {
					unexpectedTries += 1;
				} else {
					expectedTries += 1;
				}
				if (unexpectedTries >= 3 || expectedTries >= 10) {
					LOG.error("get page failed after " + unexpectedTries + " unexpected tries and " +
						expectedTries + " expected tries, last exception", e);
					throw e;
				}
			}
		}
	}

	/**
	 * Pages need to be cached locally in some cases. For example, after
	 * restoring from a DFS checkpoint, it's better to cache pages locally
	 * after reading them from DFS.
	 */
	private void cachePage(
		PageAddress page,
		GByteBuffer buffer,
		EventExecutor eventExecutor,
		GRegionContext gRegionContext) {
		buffer.retain();
		eventExecutor.execute(() -> {
			try {
				FileWriter fileWriter = getOrCreateFileWriter(
					localFileWriters,
					localFileManager,
					eventExecutor);
				internalAddPage(localFileManager,
					fileWriter,
					page,
					buffer,
					gRegionContext,
					true,
					true);
			} catch (Exception e) {
				LOG.error("cache data failed", e);
			} finally {
				buffer.release();
			}
		});
	}

	/**
	 * This is executed in the event executor, so the file writer for an
	 * event executor will not be created concurrently.
	 */
	FileWriter getOrCreateFileWriter(
		Map<EventExecutor, FileWriter> fileWriterMap, FileManager fileManager, EventExecutor eventExecutor) {
		if (closed) {
			throw new GeminiRuntimeException("InfiniteFileCache has been closed.");
		}

		FileWriter fileWriter = fileWriterMap.get(eventExecutor);
		if (fileWriter != null && (!fileWriter.isValid() || fileWriter.size() >= maxFileSize)) {
			fileManager.closeFileWriter(fileWriter);
			fileWriterMap.remove(eventExecutor);
			LOG.debug("close file writer {}/{} in {}", fileWriter.getFileID(), fileWriter.isValid(), eventExecutor);
			fileWriter = null;
		}
		if (fileWriter == null) {
			fileWriter = fileManager.createNewFileWriter();
			fileWriterMap.put(eventExecutor, fileWriter);
			LOG.debug("create new file writer {} in {}", fileWriter.getFileID(), eventExecutor);
		}
		return fileWriter;
	}

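	/**
	 * Records a read in the file cache statistics.
	 *
	 * @param size number of bytes read
	 * @param time read latency in nanoseconds
	 * @param isLocal whether the read was served from a local file or from dfs
	 */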
	private void updateReadStat(long size, long time, boolean isLocal) {
		if (isLocal) {
			fileCacheStat.addLocalRead(size, time);
		} else {
			fileCacheStat.addDFSRead(size, time);
		}
	}
}