/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine.filecache;

import org.apache.flink.runtime.state.gemini.engine.GRegionContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileReader;
import org.apache.flink.runtime.state.gemini.engine.fs.FileWriter;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.page.DataPage;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddress;
import org.apache.flink.runtime.state.gemini.engine.page.compress.GCompressAlgorithm;
import org.apache.flink.runtime.state.gemini.engine.rm.ReferenceCount.ReleaseType;
import org.apache.flink.util.Preconditions;

import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiConsumer;

/**
 * An implementation of {@link FileCache} with infinite capacity. Pages added
 * via {@link #addPage} will never be flushed to destination storage.
 */
public class InfiniteCapacityFileCache extends FileCache {

	private static final Logger LOG = LoggerFactory.getLogger(InfiniteCapacityFileCache.class);

	private final GContext gContext;

	private final long maxFileSize;

	/**
	 * File manager for cache.
	 */
	private final FileManager localFileManager;

	/**
	 * Each event executor has its own file writer.
	 */
	private Map<EventExecutor, FileWriter> localFileWriters;

	/**
	 * File manager for destination storage.
	 */
	private final FileManager dfsFileManager;

	/**
	 * Each event executor has its own file writer.
	 */
	private Map<EventExecutor, FileWriter> dfsFileWriters;

	private volatile boolean closed;

	public InfiniteCapacityFileCache(
		GContext gContext,
		FileManager localFileManager,
		FileManager dfsFileManager
		) {
		super(Long.MAX_VALUE, new FileCacheStat());

		this.gContext = Preconditions.checkNotNull(gContext);
		this.maxFileSize = gContext.getGConfiguration().getMaxLogStructureFileSize();
		Preconditions.checkArgument(maxFileSize > 0, "Max file size should be positive");

		this.localFileManager = Preconditions.checkNotNull(localFileManager);
		this.dfsFileManager = Preconditions.checkNotNull(dfsFileManager);

		this.localFileWriters = new ConcurrentHashMap<>();
		this.dfsFileWriters = new ConcurrentHashMap<>();

		FileCacheMetrics fileCacheMetrics = gContext.getFileCacheMetrics();
		if (fileCacheMetrics != null) {
			fileCacheMetrics.register(fileCacheStat);
		}

		this.closed = false;

		LOG.info("InfiniteCapacityFileCache created, LocalFileManager {}, DfsFileManager {}", localFileManager, dfsFileManager);
	}

	@Override
	public boolean isCached(PageAddress pageAddress) {
		return pageAddress.isLocalValid();
	}

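	// Adds a page to the local file cache. With infinite capacity nothing is ever evicted, so this
	// never writes to the destination (DFS) storage: the page is written asynchronously to the local
	// file manager on the given flush executor, and the callback is invoked with the result.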
	@Override
	public void addPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor flushEventExecutor,
		BiConsumer<Boolean, Throwable> callBack
	) {
		if (pageAddress.isLocalValid()) {
			if (callBack != null) {
				callBack.accept(true, null);
			}
			return;
		}
		//add reference before thread runs.
		final DataPage oriDataPage = pageAddress.getDataPage();
		flushEventExecutor.submit(() -> {
			boolean success = false;
			Throwable throwable = null;
			DataPage dataPage = oriDataPage;
			try {
				if (!pageAddress.isLocalValid()) {
					if (dataPage == null && pageAddress.isDfsValid()) {
						// for the infinite cache, this branch is taken in the following case:
						// 1. local recovery is enabled
						// 2. the job is restored from a checkpoint located on DFS
						// 3. a local snapshot is started, but some pages are neither in memory nor on local storage
						dataPage = getDataPage(dfsFileManager, gRegionContext,
							pageAddress::getDfsAddress, pageAddress, false);
					}
					if (dataPage != null) {
						FileWriter fileWriter = getOrCreateFileWriter(localFileWriters,
							localFileManager,
							flushEventExecutor);
						internalAddPage(localFileManager, fileWriter, pageAddress, dataPage, gRegionContext, true, true);
					} else {
						throw new GeminiRuntimeException("data page does not exist");
					}
				}
				success = true;
			} catch (Exception e) {
				success = false;
				throwable = e;
				LOG.error("error when adding page to cache", e);
			} finally {
				if (dataPage != null) {
					dataPage.delReferenceCount(ReleaseType.Normal);
				}
				if (callBack != null) {
					callBack.accept(success, throwable);
				}
			}
		});
	}

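	// Read path: prefer the local copy and account it as a cache hit; otherwise fall back to the DFS
	// copy, account it as a miss, and asynchronously re-write the page to the local cache so that
	// subsequent reads are served locally.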
	@Override
	public DataPage getPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor flushEventExecutor) {
		try {
			DataPage dataPage = null;
			if (pageAddress.isLocalValid()) {
				dataPage = getDataPage(localFileManager, gRegionContext,
					pageAddress::getLocalAddress, pageAddress, true);
				if (dataPage != null) {
					fileCacheStat.addHitSize(dataPage.getSize());
				}
			} else if (pageAddress.isDfsValid()) {
				dataPage = getDataPage(dfsFileManager, gRegionContext,
					pageAddress::getDfsAddress, pageAddress, false);
				if (dataPage != null) {
					// after restore, we need to cache the data
					final DataPage cacheDataPage = dataPage;
					cacheDataPage.addReferenceCount();
					flushEventExecutor.submit(() -> {
							try {
								FileWriter fileWriter = getOrCreateFileWriter(localFileWriters, localFileManager, flushEventExecutor);
								internalAddPage(localFileManager,
									fileWriter,
									pageAddress,
									cacheDataPage,
									gRegionContext,
									true,
									true);
							} catch (Exception e) {
								LOG.error("cache data failed", e);
							} finally {
								cacheDataPage.delReferenceCount(ReleaseType.Normal);
							}
						}
					);
					fileCacheStat.addMissSize(dataPage.getSize());
				}
			}
			Preconditions.checkNotNull(dataPage, "no page exists on local and dfs");
			return dataPage;
		} catch (Exception e) {
			LOG.error("exception when getting page", e);
			throw new GeminiRuntimeException("exception when get page", e);
		}
	}

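	// Invalidates the page under its lock and then decrements the file references of the local and
	// DFS copies, passing the current access number and timestamp so that the underlying files are
	// only deleted once in-flight readers can no longer reach them.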
	@Override
	public void discardPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor eventExecutor) {
		boolean pageValid;
		boolean localValid = false;
		long localAddress = -1;
		boolean dfsValid = false;
		long dfsAddress = -1;
		synchronized (pageAddress) {
			pageValid = pageAddress.isPageValid();
			if (pageValid) {
				pageAddress.setPageStatus(false);
				localValid = pageAddress.isLocalValid();
				dfsValid = pageAddress.isDfsValid();
				if (localValid) {
					localAddress = pageAddress.getLocalAddress();
				}
				if (dfsValid) {
					dfsAddress = pageAddress.getDfsAddress();
				}
			}
		}
		if (pageValid) {
			long accessNumber = gContext.getAccessNumber();
			long ts = System.currentTimeMillis();
			// decrementing the reference may lead to file deletion, but the file may still be in use
			// 1. read by the main thread
			//    In this case, we use the access number to delay the deletion.
			// 2. read by split or compaction
			//    generally discardPage is called in the region thread, which ensures the page
			//    will not be used by split or compaction after it is discarded
			if (localValid) {
				localFileManager.decDBReference(
						localAddress,
						accessNumber,
						ts,
						pageAddress.getDataLen());
			}
			if (dfsValid) {
				dfsFileManager.decDBReference(
						dfsAddress,
						accessNumber,
						ts,
						pageAddress.getDataLen());
			}
		}
	}

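	// Flushes a page to the destination (DFS) storage. Unless force is set, a page whose DFS copy is
	// already valid is skipped. If the in-memory page is gone, it is re-read from the local cache, or
	// from DFS (in which case it is also re-cached locally), before being written out.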
	@Override
	public void flushPage(
		PageAddress pageAddress,
		GRegionContext gRegionContext,
		EventExecutor eventExecutor,
		boolean force,
		BiConsumer<Boolean, Throwable> callBack) {
		if (!force && pageAddress.isDfsValid()) {
			if (callBack != null) {
				callBack.accept(true, null);
			}
			return;
		}

		//add reference before thread runs.
		final DataPage oriDataPage = pageAddress.getDataPage();
		eventExecutor.submit(() -> {
			boolean success = false;
			Throwable throwable = null;
			boolean pageIsNull = false;
			DataPage dataPage = oriDataPage;
			try {
				// recheck whether to flush
				if (force || !pageAddress.isDfsValid()) {
					if (dataPage == null) {
						pageIsNull = true;
						if (pageAddress.isLocalValid()) {
							dataPage = getDataPage(localFileManager, gRegionContext,
								pageAddress::getLocalAddress, pageAddress, true);
						} else if (pageAddress.isDfsValid()) {
							// this may happen during data migration, or on the first snapshot after rescaling
							dataPage = getDataPage(dfsFileManager, gRegionContext,
								pageAddress::getDfsAddress, pageAddress, false);

							if (dataPage != null) {
								//TODO why null?
								// after restore, we need to cache the data
								final DataPage cacheDataPage = dataPage;
								cacheDataPage.addReferenceCount();
								eventExecutor.submit(() -> {
										try {
											//TODO here actually need flushEvent to write data to localCacheDisk.
											FileWriter fileWriter = getOrCreateFileWriter(localFileWriters, localFileManager, eventExecutor);
											internalAddPage(localFileManager,
												fileWriter,
												pageAddress,
												cacheDataPage,
												gRegionContext,
												true,
												true);
										} catch (Exception e) {
											LOG.error("cache data failed", e);
										} finally {
											cacheDataPage.delReferenceCount(ReleaseType.Normal);
										}
									}
								);
							}
						}
					}
					Preconditions.checkNotNull(dataPage, "Data page is null");
					FileWriter fileWriter = getOrCreateFileWriter(dfsFileWriters, dfsFileManager, eventExecutor);
					internalAddPage(dfsFileManager, fileWriter, pageAddress, dataPage, gRegionContext, false, false);
				}
				success = true;
			} catch (Exception e) {
				success = false;
				throwable = e;
				LOG.error("error when flushing page: pageIsNull={}", pageIsNull, e);
			} finally {
				if (dataPage != null) {
					dataPage.delReferenceCount(ReleaseType.Normal);
				}
				if (callBack != null) {
					callBack.accept(success, throwable);
				}
			}
		});
	}

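	// Syncs the DFS file writer owned by the given event executor so that pages flushed on that
	// executor become durable on the destination storage.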
	@Override
	public void sync(EventExecutor eventExecutor) throws IOException {
		FileWriter fileWriter = dfsFileWriters.get(eventExecutor);
		if (fileWriter != null) {
			// the file writer guarantees thread safety for sync
			fileWriter.sync();
		}
	}

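	// Closes the per-executor file writers of both the local and the DFS file managers. The closed
	// flag makes repeated calls no-ops and causes getOrCreateFileWriter to fail afterwards.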
	@Override
	public void close() throws IOException {
		synchronized (this) {
			if (closed) {
				LOG.warn("InfiniteCapacityFileCache has been closed");
				return;
			}
			closed = true;
		}

		// DB should guarantee write will not happen after close is called.
		for (FileWriter fileWriter : localFileWriters.values()) {
			localFileManager.closeFileWriter(fileWriter);
		}
		localFileWriters.clear();

		for (FileWriter fileWriter : dfsFileWriters.values()) {
			dfsFileManager.closeFileWriter(fileWriter);
		}
		dfsFileWriters.clear();
		LOG.info("InfiniteCapacityFileCache is closed");
	}

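	// Writes the data page with the given file writer, retrying a few times on failure, then installs
	// the new address on the PageAddress under its lock and adjusts file reference counts: the file
	// behind the new address is referenced first, and the file behind the old address (or behind the
	// new one, if the page was discarded concurrently) is dereferenced afterwards.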
	private void internalAddPage(
		FileManager fileManager,
		FileWriter fileWriter,
		PageAddress pageAddress,
		DataPage dataPage,
		GRegionContext gRegionContext,
		boolean isLocal,
		boolean flushForce
	) throws Exception {
		long address;
		long startTime = System.nanoTime();
		int diskDataLen;
		int numRetries = 0;
		while (true) {
			try {
				address = fileWriter.getAddress();
				diskDataLen = dataPage.write(fileWriter,
					gRegionContext.getPageSerdeFlink(),
					pageAddress,
					isLocal ? gRegionContext.getGContext().getFlushWholePageGCompressAlgorithm() : GCompressAlgorithm.None,
					gRegionContext.getGContext().getGConfiguration().isChecksumEnable());
				fileWriter.resetFailCount();
				break;
			} catch (Exception e) {
				//TODO: #SR add a filter or something else for exceptions.
				fileWriter.increasFailCount();
				++numRetries;
				if (numRetries > 3) {
					LOG.error("internal add page exception: {}, {}", fileWriter, pageAddress, e);
					throw e;
				}
			}
		}

		updateWriteStat(diskDataLen, dataPage.getSize(), System.nanoTime() - startTime, isLocal);
		if (flushForce) {
			fileWriter.flush();
		}

		// add the reference for the file used by the new address before we check the PageAddress status
		fileManager.incDBReference(address, pageAddress.getDataLen());

		boolean pageValid;
		boolean hasOldAddress = false;
		long oldAddress = 0;
		synchronized (pageAddress) {
			//TODO DFS does not support compression yet. Known bug.
			pageAddress.afterFlush(diskDataLen, gRegionContext.getGContext().getFlushWholePageGCompressAlgorithm());
			pageValid = pageAddress.isPageValid();
			if (pageValid) {
				hasOldAddress = isLocal ? pageAddress.isLocalValid() : pageAddress.isDfsValid();
				oldAddress = isLocal ? pageAddress.getLocalAddress() : pageAddress.getDfsAddress();
			}
			// we always set the address no matter whether the page is discarded
			if (isLocal) {
				pageAddress.setLocalAddress(address);
				pageAddress.setLocalStatus(true);
			} else {
				pageAddress.setDfsAddress(address);
				pageAddress.setDfsStatus(true);
			}
		}

		if (pageValid) {
			// if old address exists, we need to dereference the file it used
			if (hasOldAddress) {
				// decrementing the reference may lead to file deletion, but the file may still be in use
				// 1. read by the main thread
				//    In this case, we use the access number to delay the deletion. Because this page
				//    is not discarded, it's important to set the new address before getting the access
				//    number so that the old address will not be used by the next access.
				// 2. read by split or compaction
				//    In this case, getPage will retry so that it can use the new address
				//    to read the data again
				fileManager.decDBReference(
					oldAddress,
					gContext.getAccessNumber(),
					System.currentTimeMillis(),
					pageAddress.getDataLen());
			}
		} else {
			// if the page is not valid, we should dereference the file used by the new address.
			fileManager.decDBReference(
				address,
				gContext.getAccessNumber(),
				System.currentTimeMillis(),
				pageAddress.getDataLen());
		}
	}

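	// Reads a data page through the given file manager, retrying on failure. The address is re-resolved
	// on every attempt because a concurrent flush may have moved the page; retries with an unchanged
	// address are counted as "unexpected" and are bounded more tightly than "expected" ones.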
	private DataPage getDataPage(
		FileManager fileManager,
		GRegionContext gRegionContext,
		Callable<Long> addressCallable,
		PageAddress pageAddress,
		boolean isLocal
	) throws Exception {
		int unexpectedTries = 0;
		int expectedTries = 0;
		DataPage dataPage;
		long address = addressCallable.call();
		while (true) {
			try {
				FileReader fileReader = fileManager.getFileReader(address);
				long offset = fileManager.getFileOffset(address);
				long startTime = System.nanoTime();
				dataPage = fileManager.getDataPageUtil().getDataPageFromReader(gRegionContext.getPageSerdeFlink(),
					fileReader,
					(int) offset,
					pageAddress);
				updateReadStat(pageAddress.getOnDiskDataLen(), System.nanoTime() - startTime, isLocal);
				return dataPage;
			} catch (Exception e) {
				long oldAddress = address;
				// the address may be replaced, and we should update it every time
				address = addressCallable.call();
				// if the address has not changed, the retry is unexpected; otherwise the page was rewritten and the retry is expected
				if (oldAddress == address) {
					unexpectedTries += 1;
				} else {
					expectedTries += 1;
				}
				if (unexpectedTries >= 3 || expectedTries >= 10) {
					LOG.error("get page failed, tried {} times unexpectedly and {} times expectedly",
						unexpectedTries, expectedTries, e);
					throw e;
				}
			}
		}
	}

	/**
	 * This will be executed in the event executor, so the file writer for an
	 * event executor will not be created concurrently.
	 */
	private FileWriter getOrCreateFileWriter(
		Map fileWriterMap,
		FileManager fileManager,
		EventExecutor eventExecutor) {
		if (closed) {
			throw new GeminiRuntimeException("InfiniteCapacityFileCache has been closed.");
		}

		FileWriter fileWriter = fileWriterMap.get(eventExecutor);
		if (fileWriter != null && (!fileWriter.isValid() || fileWriter.size() >= maxFileSize)) {
			fileManager.closeFileWriter(fileWriter);
			fileWriterMap.remove(eventExecutor);
			LOG.debug("close file writer {}/{} in {}", fileWriter.getFileID(), fileWriter.isValid(), eventExecutor);
			fileWriter = null;
		}
		if (fileWriter == null) {
			fileWriter = fileManager.createNewFileWriter();
			fileWriterMap.put(eventExecutor, fileWriter);
			LOG.debug("create new file writer {} in {}", fileWriter.getFileID(), eventExecutor);
		}
		return fileWriter;
	}

	private void updateWriteStat(long diskLen, long size, long time, boolean isLocal) {
		if (isLocal) {
			fileCacheStat.addLocalWrite(diskLen, size, time);
		} else {
			fileCacheStat.addDFSWrite(size, time);
		}
	}

	private void updateReadStat(long size, long time, boolean isLocal) {
		if (isLocal) {
			fileCacheStat.addLocalRead(size, time);
		} else {
			fileCacheStat.addDFSRead(size, time);
		}
	}
}