/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.state.gemini.engine.filecache;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.runtime.state.gemini.engine.GRegionContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.filecompaction.FileCompactionPageTransfer;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileReader;
import org.apache.flink.runtime.state.gemini.engine.fs.FileWriter;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddress;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddressSingleImpl;
import org.apache.flink.runtime.state.gemini.engine.rm.GByteBuffer;
import org.apache.flink.util.Preconditions;
import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiConsumer;
/**
* An implementation of {@link FileCache} with infinite capacity. Pages added
* via {@link #addPage} will never be flushed to the destination storage.
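*
* <p>A minimal usage sketch, assuming the engine has already wired up a
* {@code GContext} and the two {@link FileManager}s (all variable names below
* are illustrative, not part of this class):
* <pre>{@code
* InfiniteFileCache cache = new InfiniteFileCache(gContext, localFileManager, dfsFileManager);
* // read-through: a local hit is served directly, a dfs miss is cached locally
* GByteBuffer page = cache.getPage(pageAddress, regionContext, flushExecutor);
* try {
*     // use the page
* } finally {
*     page.release();
* }
* }</pre>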
*/
public class InfiniteFileCache extends FileCache implements FileCompactionPageTransfer {
private static final Logger LOG = LoggerFactory.getLogger(InfiniteFileCache.class);
private final GContext gContext;
private final long maxFileSize;
private final boolean syncWhenBatchFlush;
/**
* File manager for cache.
*/
private final FileManager localFileManager;
/**
* Each event executor has its own file writer.
*/
private Map<EventExecutor, FileWriter> localFileWriters;
/**
* File manager for destination storage.
*/
private final FileManager dfsFileManager;
/**
* Each event executor has its own file writer.
*/
private Map<EventExecutor, FileWriter> dfsFileWriters;
private volatile boolean closed;
public InfiniteFileCache(GContext gContext, FileManager localFileManager, FileManager dfsFileManager) {
super(Long.MAX_VALUE, new FileCacheStat());
this.gContext = Preconditions.checkNotNull(gContext);
this.maxFileSize = gContext.getGConfiguration().getMaxFileSize();
Preconditions.checkArgument(maxFileSize > 0, "Max file size should be positive");
this.syncWhenBatchFlush = gContext.getGConfiguration().isSnapshotSyncWhenBatchFlush();
this.localFileManager = Preconditions.checkNotNull(localFileManager);
this.dfsFileManager = Preconditions.checkNotNull(dfsFileManager);
this.localFileWriters = new ConcurrentHashMap<>();
this.dfsFileWriters = new ConcurrentHashMap<>();
FileCacheMetrics fileCacheMetrics = gContext.getFileCacheMetrics();
if (fileCacheMetrics != null) {
fileCacheMetrics.register(fileCacheStat);
}
this.closed = false;
LOG.info("InfiniteFileCache created, LocalFileManager {}, DfsFileManager {}", localFileManager, dfsFileManager);
}
@VisibleForTesting
public FileManager getLocalFileManager() {
return localFileManager;
}
@VisibleForTesting
public FileManager getDfsFileManager() {
return dfsFileManager;
}
// implementation for file cache =======================================================
@Override
public boolean isCached(PageAddress pageAddress) {
return pageAddress.isLocalValid();
}
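/**
* Asynchronously writes the page to the local cache on the given flush
* executor. If the page is already locally valid, the callback is completed
* immediately with {@code (true, null)} on the calling thread; if the page is
* neither in memory nor on DFS, the callback receives a
* {@link GeminiRuntimeException}.
*
* <p>A hedged callback sketch ({@code page} and {@code flushExecutor} are
* placeholders for caller-side state):
* <pre>{@code
* cache.addPage(page, regionContext, flushExecutor, (success, error) -> {
*     if (!success) {
*         LOG.error("adding page to local cache failed", error);
*     }
* });
* }</pre>
*/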
@Override
public void addPage(
PageAddress pageAddress,
GRegionContext gRegionContext,
EventExecutor flushEventExecutor,
BiConsumer<Boolean, Throwable> callBack) {
Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
if (pageAddress.isLocalValid()) {
if (callBack != null) {
callBack.accept(true, null);
}
return;
}
flushEventExecutor.execute(() -> {
boolean success = false;
Throwable throwable = null;
GByteBuffer gByteBuffer = pageAddress.getGByteBufferWithReference();
try {
if (!pageAddress.isLocalValid()) {
if (gByteBuffer == null && pageAddress.isDfsValid()) {
// for infinite cache, this branch is taken when all of the following hold:
// 1. local recovery is enabled
// 2. we restored from a checkpoint located on DFS
// 3. a local snapshot has started, but some pages are neither in memory nor in the local cache
gByteBuffer = getGByteBuffer(dfsFileManager,
pageAddress::getDfsAddress,
pageAddress,
false);
}
if (gByteBuffer != null) {
FileWriter fileWriter = getOrCreateFileWriter(localFileWriters,
localFileManager,
flushEventExecutor);
internalAddPage(localFileManager,
fileWriter,
pageAddress,
gByteBuffer,
gRegionContext,
true,
true);
} else {
throw new GeminiRuntimeException("data page does not exist");
}
}
success = true;
} catch (Exception e) {
success = false;
throwable = e;
LOG.error("error when adding page to cache", e);
} finally {
if (gByteBuffer != null) {
gByteBuffer.release();
}
if (callBack != null) {
callBack.accept(success, throwable);
}
}
});
}
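/**
* Reads a page, preferring the local cache. A local hit is counted in the hit
* statistics; on a miss the page is read from DFS, scheduled to be cached
* locally, and counted in the miss statistics. Throws a
* {@link GeminiRuntimeException} if the page exists neither locally nor on DFS.
*/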
@Override
public GByteBuffer getPage(
PageAddress pageAddress, GRegionContext gRegionContext, EventExecutor flushEventExecutor) {
GByteBuffer gByteBuffer = null;
try {
Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
if (pageAddress.isLocalValid()) {
gByteBuffer = getGByteBuffer(localFileManager,
pageAddress::getLocalAddress,
pageAddress,
true);
if (gByteBuffer != null) {
fileCacheStat.addHitSize(pageAddress.getDataLen());
}
} else if (pageAddress.isDfsValid()) {
gByteBuffer = getGByteBuffer(dfsFileManager, pageAddress::getDfsAddress, pageAddress, false);
if (gByteBuffer != null) {
cachePage(pageAddress, gByteBuffer, flushEventExecutor, gRegionContext);
fileCacheStat.addMissSize(pageAddress.getDataLen());
}
}
Preconditions.checkNotNull(gByteBuffer, "no page exists on local and dfs");
return gByteBuffer;
} catch (Exception e) {
if (gByteBuffer != null) {
gByteBuffer.release();
}
LOG.error("exception when get page", e);
throw new GeminiRuntimeException("exception when get page: " + e.getMessage(), e);
}
}
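/**
* Invalidates a page and decrements the references of its backing files. The
* page status is flipped under the page lock, while the actual file-reference
* updates happen outside of it; deletion of a file that may still be read by
* the main thread is delayed via the current access number.
*/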
@Override
public void discardPage(
PageAddress pageAddress, GRegionContext gRegionContext, EventExecutor eventExecutor) {
Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
boolean pageValid;
boolean localValid = false;
long localAddress = -1;
boolean dfsValid = false;
long dfsAddress = -1;
synchronized (pageAddress) {
pageValid = pageAddress.isPageValid();
if (pageValid) {
pageAddress.setPageStatus(false);
localValid = pageAddress.isLocalValid();
dfsValid = pageAddress.isDfsValid();
if (localValid) {
localAddress = pageAddress.getLocalAddress();
}
if (dfsValid) {
dfsAddress = pageAddress.getDfsAddress();
}
}
}
if (pageValid) {
long accessNumber = gContext.getAccessNumber();
long ts = System.currentTimeMillis();
// decrementing the reference may lead to file deletion, but the file may still be in use:
// 1. read by the main thread
//    In this case, we use the access number to delay the deletion.
// 2. read by split or compaction
//    Generally discardPage is called in the region thread, which ensures the page
//    will not be used by split or compaction after it is discarded.
if (localValid) {
localFileManager.decDBReference(localAddress, accessNumber, ts, pageAddress.getDataLen());
}
if (dfsValid) {
dfsFileManager.decDBReference(dfsAddress, accessNumber, ts, pageAddress.getDataLen());
}
}
}
/**
* FIXME This method doesn't guarantee that data can be read immediately
* after the address is updated; use {@link #flushBatchPages} instead.
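*
* <p>A hedged example of forcing a single page to DFS ({@code page} and
* {@code snapshotExecutor} are illustrative):
* <pre>{@code
* cache.flushPage(page, regionContext, snapshotExecutor, true, (ok, err) -> {
*     if (!ok) {
*         LOG.error("flush to dfs failed", err);
*     }
* });
* }</pre>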
*/
@Override
public void flushPage(
PageAddress pageAddress,
GRegionContext gRegionContext,
EventExecutor eventExecutor,
boolean force,
BiConsumer<Boolean, Throwable> callBack) {
Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
if (!force && pageAddress.isDfsValid()) {
if (callBack != null) {
callBack.accept(true, null);
}
return;
}
eventExecutor.execute(() -> {
boolean success = false;
Throwable throwable = null;
boolean pageIsNull = false;
GByteBuffer gByteBuffer = pageAddress.getGByteBufferWithReference();
try {
// recheck whether to flush
if (force || !pageAddress.isDfsValid()) {
if (gByteBuffer == null) {
pageIsNull = true;
if (pageAddress.isLocalValid()) {
gByteBuffer = getGByteBuffer(localFileManager,
pageAddress::getLocalAddress,
pageAddress,
true);
} else if (pageAddress.isDfsValid()) {
// this may happen when taking the first snapshot after rescaling
gByteBuffer = getGByteBuffer(dfsFileManager,
pageAddress::getDfsAddress,
pageAddress,
false);
// TODO the page is not cached locally here because the eventExecutor used here
// is usually the snapshotEventExecutor rather than the flushEventExecutor. The
// snapshot executor is not used for normal flushes, so a file created by it
// would not be written again after this cache operation and could not be
// closed (the file may not be full), which means its resources could never be
// released. So we don't cache the data here for now and rely on file download
// when restoring.
// cachePage(pageAddress, gByteBuffer, eventExecutor, gRegionContext);
}
}
Preconditions.checkNotNull(gByteBuffer, "Data page is null");
FileWriter fileWriter = getOrCreateFileWriter(dfsFileWriters, dfsFileManager, eventExecutor);
internalAddPage(dfsFileManager, fileWriter, pageAddress, gByteBuffer,
gRegionContext, false, false);
}
success = true;
} catch (Exception e) {
success = false;
throwable = e;
LOG.error("error when adding page to cache: pageIsNull={}, {}", pageIsNull, e.getMessage(), e);
} finally {
if (gByteBuffer != null) {
gByteBuffer.release();
}
if (callBack != null) {
callBack.accept(success, throwable);
}
}
});
}
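/**
* Writes a batch of pages to the local cache on the given executor. All pages
* go through a single {@link FileWriter}, the writer is flushed once, and only
* then are the page addresses updated, so data is readable as soon as an
* address becomes visible. Every callback in {@code callBacks} receives the
* same overall outcome, with failures wrapped in an
* {@code AddBatchPageException}.
*/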
@Override
public void addBatchPages(
List<PageAddress> pages,
List<GRegionContext> gRegionContexts,
EventExecutor eventExecutor,
List<BiConsumer<Boolean, Throwable>> callBacks) {
if (pages.isEmpty()) {
return;
}
eventExecutor.execute(() -> {
boolean success = true;
Throwable throwable = null;
try {
int size = pages.size();
// TODO reuse list
List<Long> addressList = new ArrayList<>(size);
FileWriter fileWriter = getOrCreateFileWriter(localFileWriters, localFileManager, eventExecutor);
for (int i = 0; i < size; i++) {
PageAddress page = pages.get(i);
if (!page.isLocalValid()) {
GRegionContext gRegionContext = gRegionContexts.get(i);
GByteBuffer buffer = page.getGByteBufferWithReference();
try {
if (buffer == null && page.isDfsValid()) {
// the page is not in memory; read it from dfs
buffer = getGByteBuffer(dfsFileManager, page::getDfsAddress, page, false);
}
if (buffer != null) {
// write page to local
long address = writePage(localFileManager,
fileWriter,
page,
buffer,
gRegionContext,
true);
addressList.add(address);
} else {
throw new GeminiRuntimeException("data page does not exist");
}
} finally {
// release buffer as soon as possible
if (buffer != null) {
buffer.release();
}
}
} else {
// a null address indicates there is no need to update page
addressList.add(null);
}
}
// flush to ensure data can be read immediately after addresses are updated
fileWriter.flush();
long accessNumber = gRegionContexts.get(0).getGContext().getAccessNumber();
// update file references; no exception should happen here
for (int i = 0; i < size; i++) {
Long address = addressList.get(i);
// skip pages that don't have a new address
if (address != null) {
updatePageAddress(localFileManager, pages.get(i), address, true, accessNumber);
}
}
} catch (Exception e) {
success = false;
throwable = new AddBatchPageException(e);
} finally {
// execute callbacks for all pages
for (BiConsumer<Boolean, Throwable> callBack : callBacks) {
if (callBack != null) {
callBack.accept(success, throwable);
}
}
}
});
}
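/**
* Flushes a batch of pages to DFS on the given executor, optionally also
* ensuring a local copy when {@code flushLocal} is set and the page was not in
* memory. Depending on the configuration, the DFS writer is either synced or
* only flushed before the page addresses are updated; all callbacks receive
* the same overall outcome, with failures wrapped in a
* {@code FlushBatchPageException}.
*/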
@Override
public void flushBatchPages(
List<PageAddress> pages,
List<GRegionContext> gRegionContexts,
EventExecutor eventExecutor,
boolean force,
boolean flushLocal,
List<BiConsumer<Boolean, Throwable>> callBacks) {
if (pages.isEmpty()) {
return;
}
// TODO refactor code to remove duplication with addBatchPages
eventExecutor.execute(() -> {
boolean success = true;
Throwable throwable = null;
try {
int size = pages.size();
// TODO reuse list
List<Long> addressList = new ArrayList<>(size);
FileWriter fileWriter = getOrCreateFileWriter(dfsFileWriters, dfsFileManager, eventExecutor);
for (int i = 0; i < size; i++) {
PageAddress page = pages.get(i);
if (force || !page.isDfsValid()) {
GByteBuffer buffer = page.getGByteBufferWithReference();
boolean inMemory = buffer != null;
try {
if (!inMemory) {
// read page from local or dfs
if (page.isLocalValid()) {
buffer = getGByteBuffer(localFileManager, page::getLocalAddress, page, true);
} else if (page.isDfsValid()) {
buffer = getGByteBuffer(dfsFileManager, page::getDfsAddress, page, false);
}
}
if (buffer != null) {
// write page to dfs
long address = writePage(dfsFileManager,
fileWriter,
page,
buffer,
gRegionContexts.get(i),
false);
addressList.add(address);
// TODO to fix EOF, we always flush a page to local if it does not exist there.
// We check this after writing to dfs because we hope the local snapshot has
// already done it, and there is no need to flush again if local recovery is
// enabled. If inMemory is true, evict/discard ensures that local is valid
// before setting the data page to null, so there is no need to flush local here.
if (!inMemory && flushLocal && !page.isLocalValid()) {
FileWriter localFileWriter = getOrCreateFileWriter(localFileWriters,
localFileManager,
eventExecutor);
// this will call outputStream.flush and update local address
internalAddPage(localFileManager,
localFileWriter,
page,
buffer,
gRegionContexts.get(i),
true,
true);
}
} else {
throw new GeminiRuntimeException("data page does not exist");
}
} finally {
// release buffer as soon as possible
if (buffer != null) {
buffer.release();
}
}
} else {
// a null address indicates there is no need to update page
addressList.add(null);
}
}
// flush to ensure data can be read immediately after addresses are updated
if (syncWhenBatchFlush) {
fileWriter.sync();
} else {
fileWriter.flush();
}
long accessNumber = gRegionContexts.get(0).getGContext().getAccessNumber();
// update file references; no exception should happen here
for (int i = 0; i < size; i++) {
Long address = addressList.get(i);
// skip pages that don't have a new address
if (address != null) {
updatePageAddress(dfsFileManager, pages.get(i), address, false, accessNumber);
}
}
} catch (Exception e) {
success = false;
throwable = new FlushBatchPageException(e);
} finally {
// execute callbacks for all pages
for (BiConsumer<Boolean, Throwable> callBack : callBacks) {
if (callBack != null) {
callBack.accept(success, throwable);
}
}
}
});
}
/**
* Currently this is only used to sync data when a snapshot finishes.
*/
@Override
public void sync() throws IOException {
// sync dfs data
for (FileWriter fileWriter : dfsFileWriters.values()) {
fileWriter.sync();
}
// for local snapshot, it's better to sync data for all writers
for (FileWriter fileWriter : localFileWriters.values()) {
fileWriter.sync();
}
}
@Override
public FileCacheType getFileCacheType() {
return FileCacheType.INFINITE;
}
// implementation for page transfer =======================================================
@Override
public FileManager getDbFileManager() {
return localFileManager;
}
@Override
public boolean hasDbFileAddress(PageAddress pageAddress) {
return pageAddress.isPageValid() && pageAddress.isLocalValid();
}
@Override
public int getDbFileId(PageAddress pageAddress) {
return localFileManager.getSimpleFileID(pageAddress.getLocalAddress());
}
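/**
* Rewrites a locally valid page through the current local file writer on
* behalf of file compaction, so that the page moves out of the file being
* compacted. The callback, if any, is completed with the outcome.
*/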
@Override
public void transferPage(
PageAddress pageAddress,
GRegionContext gRegionContext,
EventExecutor eventExecutor,
@Nullable BiConsumer<Boolean, Throwable> callBack) {
// FIXME file compaction is not started from the region executor, so it is
// useless to fetch the byte buffer outside of the executor. Do not enable
// file compaction in off-heap mode.
eventExecutor.execute(() -> {
boolean success = false;
Throwable throwable = null;
GByteBuffer buffer = pageAddress.getGByteBufferWithReference();
try {
// TODO there is some work left to complete:
// 1. the mechanism for data page references in off-heap mode is not suitable
//    in this case; this is expected to be solved in [BLINK-21500417], so
//    currently file compaction can only be enabled in on-heap mode
// 2. if the page is not in memory, we load it, build a DataPage and write it
//    again. Actually there is no need to build a DataPage: the input byte
//    stream could be output directly. If compression is enabled, a bit more
//    work is needed, so we will do that after rebasing the compression code
// recheck whether the local address is valid
if (pageAddress.isPageValid() && pageAddress.isLocalValid()) {
if (buffer == null) {
long localAddress = pageAddress.getLocalAddress();
FileReader fileReader = localFileManager.getFileReader(localAddress);
long offset = localFileManager.getFileOffset(localAddress);
buffer = localFileManager.getDataPageUtil().getDataPageFromReader(
fileReader,
(int) offset,
pageAddress);
}
FileWriter fileWriter = getOrCreateFileWriter(localFileWriters,
localFileManager, eventExecutor);
internalAddPage(localFileManager, fileWriter, pageAddress, buffer,
gRegionContext, true, true);
success = true;
}
} catch (Exception e) {
throwable = e;
} finally {
if (buffer != null) {
buffer.release();
}
if (callBack != null) {
callBack.accept(success, throwable);
}
}
});
}
@Override
public void close() throws IOException {
synchronized (this) {
if (closed) {
LOG.warn("NoFileCache has been closed");
return;
}
closed = true;
}
// The DB should guarantee that no write happens after close is called.
for (FileWriter fileWriter : localFileWriters.values()) {
localFileManager.closeFileWriter(fileWriter);
}
localFileWriters.clear();
for (FileWriter fileWriter : dfsFileWriters.values()) {
dfsFileManager.closeFileWriter(fileWriter);
}
dfsFileWriters.clear();
LOG.info("InfiniteFileCache is closed");
}
@VisibleForTesting
Map<EventExecutor, FileWriter> getDfsFileWriters() {
return dfsFileWriters;
}
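/**
* Reads a page from the given file manager, retrying on failure. The address
* is re-resolved before every retry because it may have been replaced
* concurrently: up to 3 retries are tolerated while the address stays the
* same (unexpected), and up to 10 while it keeps changing (expected).
*/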
private GByteBuffer getGByteBuffer(
FileManager fileManager,
Callable<Long> addressCallable,
PageAddress pageAddress,
boolean isLocal) throws Exception {
Preconditions.checkArgument(pageAddress instanceof PageAddressSingleImpl);
int unexpectedTries = 0;
int expectedTries = 0;
GByteBuffer gByteBuffer = null;
long address = addressCallable.call();
while (true) {
try {
FileReader fileReader = fileManager.getFileReader(address);
long offset = fileManager.getFileOffset(address);
long startTime = System.nanoTime();
gByteBuffer = fileManager.getDataPageUtil().getDataPageFromReader(fileReader,
(int) offset,
pageAddress);
updateReadStat(pageAddress.getDataLen(), System.nanoTime() - startTime, isLocal);
return gByteBuffer;
} catch (Exception e) {
if (gByteBuffer != null) {
gByteBuffer.release();
}
gByteBuffer = null;
long oldAddress = address;
// the address may have been replaced, so refresh it on every retry
address = addressCallable.call();
// if the address is unchanged, count an unexpected retry; otherwise count an expected one
if (oldAddress == address) {
unexpectedTries += 1;
} else {
expectedTries += 1;
}
if (unexpectedTries >= 3 || expectedTries >= 10) {
LOG.error("get page failed, try " + unexpectedTries + " times unexpectedly, and try " +
expectedTries + " times as expected, last exception", e);
throw e;
}
}
}
}
/**
* Pages need to be cached locally in some cases. For example, after
* restoring from a DFS checkpoint, it's better to cache pages locally
* after reading them from DFS.
*/
private void cachePage(
PageAddress page,
GByteBuffer buffer,
EventExecutor eventExecutor,
GRegionContext gRegionContext) {
buffer.retain();
eventExecutor.execute(() -> {
try {
FileWriter fileWriter = getOrCreateFileWriter(
localFileWriters,
localFileManager,
eventExecutor);
internalAddPage(localFileManager,
fileWriter,
page,
buffer,
gRegionContext,
true,
true);
} catch (Exception e) {
LOG.error("cache data failed", e);
} finally {
buffer.release();
}
});
}
/**
* This will be executed in the event executor, so the file writer for an
* event executor will not be created concurrently.
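*
* <p>A writer is rolled over (closed and replaced by a new one) once it is no
* longer valid or its size reaches {@code maxFileSize}.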
*/
FileWriter getOrCreateFileWriter(
Map<EventExecutor, FileWriter> fileWriterMap, FileManager fileManager, EventExecutor eventExecutor) {
if (closed) {
throw new GeminiRuntimeException("InfiniteFileCache has been closed.");
}
FileWriter fileWriter = fileWriterMap.get(eventExecutor);
if (fileWriter != null && (!fileWriter.isValid() || fileWriter.size() >= maxFileSize)) {
fileManager.closeFileWriter(fileWriter);
fileWriterMap.remove(eventExecutor);
LOG.debug("close file writer {}/{} in {}", fileWriter.getFileID(), fileWriter.isValid(), eventExecutor);
fileWriter = null;
}
if (fileWriter == null) {
fileWriter = fileManager.createNewFileWriter();
fileWriterMap.put(eventExecutor, fileWriter);
LOG.debug("create new file writer {} in {}", fileWriter.getFileID(), eventExecutor);
}
return fileWriter;
}
private void updateReadStat(long size, long time, boolean isLocal) {
if (isLocal) {
fileCacheStat.addLocalRead(size, time);
} else {
fileCacheStat.addDFSRead(size, time);
}
}
}