Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download or modify it as often as you want.
org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManagerImpl Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.state.gemini.engine.snapshot;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.base.IntSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.MapSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.runtime.TupleSerializer;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.fs.FileCleaner;
import org.apache.flink.runtime.state.gemini.engine.fs.FileIDImpl;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileMeta;
import org.apache.flink.runtime.state.gemini.engine.memstore.WriteBufferManager;
import org.apache.flink.runtime.state.gemini.engine.metrics.SnapshotCompactionMetrics;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;
import org.apache.flink.shaded.guava18.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
/**
* Implementation of {@link SnapshotManager}.
*/
public class SnapshotManagerImpl implements SnapshotManager {
private static final Logger LOG = LoggerFactory.getLogger(SnapshotManagerImpl.class);
public static final String SNAPSHOT_DIR = "snapshot";
public static final String SNAPSHOT_FILE_PREFIX = "snapshot";
public static final String SNAPSHOT_FILE_SEPERATOR = "-";
private final boolean localSnapshotEnabled;
private final FileManager localFileManager;
private final FileManager dfsFileManager;
private boolean needToBreakLineage;
private final WriteBufferManager writeBufferManager;
private final SortedMap completedSnapshots;
private final SortedMap runningSnapshots;
private final SortedSet runningSnapshotAccessNumber;
private volatile long minRunningSnapshotAccessNumber;
private final ExecutorService snapshotExecutor;
private final GContext gContext;
private final FileCleaner fileCleaner;
/** Snapshot manager wide lock to safeguard the snapshot updates. */
private final Object lock = new Object();
private final SnapshotCompactionStat snapshotCompactionStat;
@Nullable
private SnapshotCompactionMetrics snapshotCompactionMetrics;
/**
 * Creates the snapshot manager together with its dedicated snapshot executor.
 *
 * <p>The executor has exactly one thread and a bounded task queue. Snapshot compaction
 * metrics are only created and registered when the context provides a DB metric group.
 *
 * @param gContext the DB context supplying configuration, supervisor and metric group.
 * @param writeBufferManager the write buffer manager that takes part in every snapshot.
 * @param localFileManager the file manager passed to local snapshot operations.
 * @param dfsFileManager the file manager passed to DFS snapshot operations.
 */
public SnapshotManagerImpl(
    GContext gContext,
    WriteBufferManager writeBufferManager,
    FileManager localFileManager,
    FileManager dfsFileManager) {
    // collaborators handed in by the caller
    this.gContext = gContext;
    this.writeBufferManager = writeBufferManager;
    this.localFileManager = localFileManager;
    this.dfsFileManager = dfsFileManager;

    // bookkeeping of completed / running snapshots; no snapshot is running initially
    this.completedSnapshots = new TreeMap<>();
    this.runningSnapshots = new TreeMap<>();
    this.runningSnapshotAccessNumber = new TreeSet<>();
    this.minRunningSnapshotAccessNumber = Long.MAX_VALUE;
    this.localSnapshotEnabled = gContext.getGConfiguration().isLocalSnapshotEnabled();

    // one worker thread, named after the configured executor prefix
    String executorPrefix = gContext.getGConfiguration().getExecutorPrefixName();
    ThreadFactory snapshotThreadFactory = new ThreadFactoryBuilder()
        .setNameFormat(executorPrefix + "geminiMainSnapshot-%d")
        .build();
    this.snapshotExecutor = new ThreadPoolExecutor(
        1,
        1,
        0L,
        TimeUnit.MILLISECONDS,
        new LinkedBlockingQueue<>(Short.MAX_VALUE),
        snapshotThreadFactory);

    this.fileCleaner = gContext.getSupervisor().getFileCleaner();
    this.snapshotCompactionStat = new SnapshotCompactionStat();

    // metrics are optional: they are only wired up when a metric group is available
    MetricGroup dbMetricGroup = gContext.getDBMetricGroup();
    if (dbMetricGroup != null) {
        SnapshotCompactionMetrics compactionMetrics = new SnapshotCompactionMetrics(
            dbMetricGroup.addGroup("snapshot_compaction"),
            gContext.getGConfiguration().getMetricSampleCount(),
            gContext.getGConfiguration().getMetricHistogramWindowSize());
        this.snapshotCompactionMetrics = compactionMetrics;
        compactionMetrics.register(snapshotCompactionStat);
    }
    LOG.info("SnapshotManager is created.");
}
/**
 * Returns the namespace of this manager, which is the URI string of the
 * DFS file manager's base path.
 */
@Override
public String getNameSpace() {
    return dfsFileManager
        .getBasePath()
        .toUri()
        .toString();
}
/**
 * Returns the current value of the break-lineage flag.
 */
@Override
public boolean isNeedToBreakLineage() {
    return this.needToBreakLineage;
}
/**
 * Sets the break-lineage flag.
 *
 * <p>Note this method should be called before restoreLineage. The flag is also read by
 * {@code restore} and {@code startSnapshot} in this class.
 *
 * @param needToBreakLineage the new value of the flag.
 */
@Override
public void setNeedToBreakLineage(boolean needToBreakLineage) {
    this.needToBreakLineage = needToBreakLineage;
}
/**
 * Starts a snapshot for the checkpoint described by the given meta.
 *
 * <p>The pending snapshot is registered under the manager lock, and {@code endSnapshot}
 * is scheduled on the snapshot executor for when the pending snapshot's result future
 * completes.
 *
 * @param backendSnapshotMeta meta information of the checkpoint to take.
 * @return the future carrying the DB snapshot result of the pending snapshot.
 * @throws IllegalArgumentException if a snapshot for the same checkpoint is already running.
 */
@Override
public Future startSnapshot(BackendSnapshotMeta backendSnapshotMeta) {
    gContext.checkDBStatus();
    final long syncStartTime = System.currentTimeMillis();
    final long checkpointId = backendSnapshotMeta.getCheckpointId();
    gContext.increaseCurVersion();
    // The access number taken here is recorded below so that files used by this
    // snapshot will not be deleted when the DB discards them.
    final long accessNumber = gContext.incrementAndGetAccessNumber();
    LOG.info("GeminiDB start checkpoint {}, start time {}, access number {}.",
        checkpointId,
        syncStartTime,
        accessNumber);

    synchronized (lock) {
        // Validate before the access number is recorded, so that nothing has to be
        // released if the argument check fails.
        Preconditions.checkArgument(!runningSnapshots.containsKey(checkpointId),
            checkpointId + " is already running.");

        final SnapshotOperation operation;
        if (localSnapshotEnabled) {
            operation = new LocalAndDFSSnapshotOperation(gContext, this, dfsFileManager, localFileManager);
        } else {
            operation = new DFSSnapshotOperation(gContext, this, dfsFileManager);
        }
        operation.setForceFlushPage(needToBreakLineage);

        final PendingSnapshot pending = operation.createPendingSnapshot(
            backendSnapshotMeta, accessNumber);
        pending.getSnapshotStat().setSyncStartTime(syncStartTime);

        // register the snapshot and refresh the minimum running access number
        runningSnapshots.put(checkpointId, pending);
        runningSnapshotAccessNumber.add(accessNumber);
        minRunningSnapshotAccessNumber = runningSnapshotAccessNumber.first();

        final SnapshotCompletableFuture resultFuture = pending.getResultFuture();
        resultFuture.whenCompleteAsync(
            (ignoredResult, failure) -> endSnapshot(checkpointId, failure),
            snapshotExecutor);

        resultFuture.incRunningTask();
        try {
            writeBufferManager.doSnapshot(operation);
        } finally {
            // Even if doSnapshot throws, decRunningTask makes sure endSnapshot runs
            // so that the pending snapshot can be released.
            resultFuture.decRunningTask();
            pending.getSnapshotStat().setAsyncStartTime(System.currentTimeMillis());
        }
        return pending.getDbSnapshotResultFuture();
    }
}
/**
 * Finishes the running snapshot of the given checkpoint, either successfully or exceptionally.
 *
 * <p>Under the manager lock the pending snapshot is removed from the running snapshots. If it
 * is not found, a warning is logged and nothing else happens. Otherwise the DB snapshot result
 * future is completed: normally when there is no failure and the snapshot was not canceled,
 * exceptionally in every other case. Finally the pending snapshot's resources are released and
 * the minimum running access number is recomputed.
 *
 * @param checkpointId id of the checkpoint whose snapshot ends.
 * @param throwable failure reported for the snapshot, or {@code null} if none was reported.
 */
@Override
public void endSnapshot(long checkpointId, Throwable throwable) {
synchronized (lock) {
PendingSnapshot pendingSnapshot = runningSnapshots.remove(checkpointId);
if (pendingSnapshot == null) {
LOG.warn("snapshot {} is not running, and can't be ended.", checkpointId);
return;
}
pendingSnapshot.getSnapshotStat().setCompleteTime(System.currentTimeMillis());
// Collect the reasons for failure: an abnormal DB status and/or the reported throwable.
Throwable suppressedThrowable = null;
if (!gContext.isDBNormal()) {
suppressedThrowable = new GeminiRuntimeException(
"DB is in abnormal status: " + gContext.getDBStatus().name());
}
if (throwable != null) {
// combine the reported failure with the DB status failure, if there is one
suppressedThrowable = ExceptionUtils.firstOrSuppressed(throwable, suppressedThrowable);
}
try {
CompletableFuture dbSnapshotResultFuture = pendingSnapshot.getDbSnapshotResultFuture();
if (suppressedThrowable == null && !pendingSnapshot.isCanceled()) {
// success path: turn the pending snapshot into a completed one
try {
CompletedSnapshot completedSnapshot = createCompletedSnapshot(pendingSnapshot);
completedSnapshots.put(checkpointId, completedSnapshot);
dbSnapshotResultFuture.complete(pendingSnapshot.getDbSnapshotResult());
LOG.info("GeminiDB finished checkpoint {}, SnapshotStat {}", checkpointId, pendingSnapshot.getSnapshotStat());
} catch (Exception exception) {
// completing failed: register the meta file for cleaning and fail the future
LOG.error("Failed to complete snapshot {}", checkpointId, exception);
discardCheckpointMetaFile(pendingSnapshot.getSnapshotMetaPath().toUri().toString());
dbSnapshotResultFuture.completeExceptionally(exception);
}
} else {
// failure or cancellation path
if (pendingSnapshot.getDbSnapshotResult() != null) {
// a snapshot result exists, so the meta file is registered for cleaning
discardCheckpointMetaFile(pendingSnapshot.getSnapshotMetaPath().toUri().toString());
}
if (pendingSnapshot.isCanceled()) {
LOG.info("GeminiDB cancel checkpoint {}", checkpointId, suppressedThrowable);
// make sure the future is failed with a CancellationException attached
CancellationException cancellationException = new CancellationException();
suppressedThrowable = suppressedThrowable == null
? cancellationException
: ExceptionUtils.firstOrSuppressed(suppressedThrowable, cancellationException);
} else {
LOG.warn("GeminiDB fail to complete checkpoint {}", checkpointId, suppressedThrowable);
}
dbSnapshotResultFuture.completeExceptionally(suppressedThrowable);
}
} finally {
// always release the pending snapshot, whatever happened above
pendingSnapshot.releaseResource();
// update the access number
runningSnapshotAccessNumber.remove(pendingSnapshot.getAccessNumber());
// Long.MAX_VALUE is used when no snapshot is running any more
minRunningSnapshotAccessNumber = !runningSnapshotAccessNumber.isEmpty()
? runningSnapshotAccessNumber.first() : Long.MAX_VALUE;
}
}
}
/**
 * Builds the {@link CompletedSnapshot} for a pending snapshot that finished successfully.
 *
 * <p>File amplification is checked and the compaction statistics are updated first. The
 * amplification check runs before the snapshot references are added, because it inspects
 * the reference count of each file without this snapshot's own reference. Afterwards one
 * snapshot reference is added to every data file in the pending snapshot's file mapping.
 *
 * @param pendingSnapshot the pending snapshot to complete.
 * @return the completed snapshot holding the checkpoint id, the meta file path and the
 *         ids of all referenced data files.
 * @throws Exception if completing the snapshot fails.
 */
CompletedSnapshot createCompletedSnapshot(PendingSnapshot pendingSnapshot) throws Exception {
    // typed set instead of a raw one; the ids are iterated as Integer when the snapshot is discarded
    Set<Integer> dataFileIDs = new HashSet<>();
    checkFileAmplification(pendingSnapshot);
    updateSnapshotCompactionStat(pendingSnapshot.getSnapshotStat());
    // add data files reference if snapshot successfully.
    for (int id : pendingSnapshot.getFileMapping().keySet()) {
        dfsFileManager.incSnapshotReference(new FileIDImpl(id));
        dataFileIDs.add(id);
    }
    return new CompletedSnapshot(
        pendingSnapshot.getCheckpointId(),
        pendingSnapshot.getSnapshotMetaPath().toUri().toString(),
        dataFileIDs);
}
/**
 * Returns the smallest access number among the running snapshots as last recorded by
 * this manager; the value is {@code Long.MAX_VALUE} when no snapshot is running.
 */
@Override
public long getMinRunningSnapshotAccessNumber() {
    return this.minRunningSnapshotAccessNumber;
}
/**
 * Looks up the running snapshot of the given checkpoint under the manager lock.
 *
 * <p>Note this method should be called before endSnapshot for the same checkpoint,
 * because endSnapshot removes the snapshot from the running snapshots.
 *
 * @param checkpointId id of the checkpoint.
 * @return the pending snapshot, or {@code null} if none is running for that checkpoint.
 */
@Nullable
@Override
public PendingSnapshot getPendingSnapshot(long checkpointId) {
    final PendingSnapshot running;
    synchronized (lock) {
        running = runningSnapshots.get(checkpointId);
    }
    return running;
}
/**
 * Returns the executor on which snapshot completion callbacks are run.
 */
@Override
public ExecutorService getSnapshotExecutor() {
    return this.snapshotExecutor;
}
/**
 * Reacts to a completed checkpoint by resetting the break-lineage flag if it was set.
 *
 * @param snapshotId id of the checkpoint that completed.
 */
@Override
public void notifySnapshotComplete(long snapshotId) {
    if (!needToBreakLineage) {
        // nothing to reset
        return;
    }
    // Lineage had to be broken before; now that a checkpoint completed, the flag is set back to false.
    needToBreakLineage = false;
    LOG.info("As checkpoint {} completed, we would no longer need to flush all pages out.", snapshotId);
}
/**
 * Reacts to an aborted checkpoint.
 *
 * <p>If the snapshot is still running it is marked as ended and canceled. Otherwise a
 * completed snapshot with that id, if any, is removed and discarded outside the lock.
 *
 * @param snapshotId id of the checkpoint that was aborted.
 */
@Override
public void notifySnapshotAbort(long snapshotId) {
    CompletedSnapshot completedToDiscard = null;
    synchronized (lock) {
        PendingSnapshot running = runningSnapshots.get(snapshotId);
        if (running == null) {
            // not running any more: drop it from the completed snapshots instead
            completedToDiscard = completedSnapshots.remove(snapshotId);
        } else {
            // TODO currently, set pending checkpoint as cancelled would not interrupt the async checkpoint phase.
            running.resultFuture.setEndSnapshot();
            running.setCanceled(true);
        }
    }
    if (completedToDiscard != null) {
        discardCompletedSnapshot(completedToDiscard);
    }
}
/**
 * Removes and discards every completed snapshot whose id is less than or equal to the
 * given snapshot id.
 *
 * <p>The snapshots are removed from the sorted map under the manager lock and discarded
 * after the lock has been released.
 *
 * @param snapshotId id of the subsuming checkpoint.
 */
@Override
public void notifySnapshotSubsume(long snapshotId) {
    Set<CompletedSnapshot> snapshotsToAbort = new HashSet<>();
    synchronized (lock) {
        Iterator<Map.Entry<Long, CompletedSnapshot>> iterator = completedSnapshots.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry<Long, CompletedSnapshot> entry = iterator.next();
            if (entry.getKey() <= snapshotId) {
                iterator.remove();
                snapshotsToAbort.add(entry.getValue());
            } else {
                // the map is sorted by checkpoint id, so all remaining entries are newer
                break;
            }
        }
    }
    for (CompletedSnapshot completedSnapshot : snapshotsToAbort) {
        discardCompletedSnapshot(completedSnapshot);
    }
}
/**
 * Restores the snapshot manager from the given base path.
 *
 * <p>When lineage has to be broken nothing is restored and an empty map is returned.
 * Otherwise all snapshots found under the base path, except the restored one, are loaded
 * from their meta files. The restored snapshot is added using the given file mapping, and
 * all of them are registered as completed snapshots.
 *
 * @param snapshotId id of the snapshot being restored.
 * @param fileMapping file mapping of the snapshot being restored.
 * @param restoredBasePath base path the DB is restored from.
 * @return the restored snapshots keyed by snapshot id.
 */
@Override
public Map restore(
    long snapshotId,
    Map fileMapping,
    String restoredBasePath) {
    if (needToBreakLineage) {
        LOG.info("no snapshot is restored because lineage needs to be broken");
        return Collections.emptyMap();
    }

    Map snapshots = loadSnapshots(restoredBasePath, Collections.singleton(snapshotId));
    String restoredMetaPath =
        getDFSSnapshotMetaPath(new Path(restoredBasePath), snapshotId).toUri().toString();
    snapshots.put(snapshotId, new RestoredSnapshot(snapshotId, restoredMetaPath, fileMapping));
    restoreSnapshots(snapshots);
    LOG.info("restore snapshot manager successfully with {} snapshots: {} from {}.", snapshots.size(), snapshots.keySet(), restoredBasePath);
    return snapshots;
}
/**
 * Shuts the snapshot executor down via {@code shutdownNow()} and clears all snapshot
 * bookkeeping while holding the manager lock.
 */
@Override
public void close() throws IOException {
    synchronized (lock) {
        snapshotExecutor.shutdownNow();
        LOG.info("SnapshotManager is closed");
        // drop everything that is tracked about running and completed snapshots
        completedSnapshots.clear();
        runningSnapshots.clear();
        runningSnapshotAccessNumber.clear();
    }
}
/**
 * Returns an unmodifiable view of the completed snapshots; exposed for tests.
 */
@VisibleForTesting
public Map getCompletedSnapshots() {
    Map readOnlyView = Collections.unmodifiableMap(completedSnapshots);
    return readOnlyView;
}
/**
 * Returns the map of running snapshots itself (not a copy); exposed for tests.
 */
@VisibleForTesting
Map getRunningSnapshots() {
    return this.runningSnapshots;
}
/**
 * Returns the sorted set of access numbers of running snapshots itself (not a copy);
 * exposed for tests.
 */
@VisibleForTesting
SortedSet getRunningSnapshotAccessNumber() {
    return this.runningSnapshotAccessNumber;
}
/**
 * Overwrites the recorded minimum running snapshot access number; exposed for tests.
 *
 * @param accessNumber the value to record.
 */
@VisibleForTesting
void setMinRunningSnapshotAccessNumber(long accessNumber) {
    minRunningSnapshotAccessNumber = accessNumber;
}
/**
 * Registers every restored snapshot as a completed snapshot under the manager lock.
 *
 * <p>The data file ids of each completed snapshot are the keys of the restored
 * snapshot's file mapping.
 *
 * @param snapshots restored snapshots keyed by checkpoint id.
 */
private void restoreSnapshots(Map<Long, RestoredSnapshot> snapshots) {
    synchronized (lock) {
        for (Map.Entry<Long, RestoredSnapshot> entry : snapshots.entrySet()) {
            long checkpointId = entry.getKey();
            RestoredSnapshot restoredSnapshot = entry.getValue();
            completedSnapshots.put(checkpointId,
                new CompletedSnapshot(checkpointId,
                    restoredSnapshot.getMetaFilePath(),
                    restoredSnapshot.getFileMapping().keySet()));
        }
    }
}
/**
 * Loads the snapshots whose meta files are found in the snapshot directory below the
 * given DB path.
 *
 * <p>Loading is best-effort: if the directory cannot be listed an empty map is returned,
 * and files that have no valid snapshot meta file name or cannot be read are logged and
 * skipped.
 *
 * @param restoredDBPath base path of the DB to restore from.
 * @param excludeSnapshots ids of snapshots that must not be loaded.
 * @return the successfully loaded snapshots keyed by snapshot id; never {@code null}.
 */
private Map<Long, RestoredSnapshot> loadSnapshots(
    String restoredDBPath,
    Set<Long> excludeSnapshots) {
    Map<Long, RestoredSnapshot> snapshots = new HashMap<>();
    Path metaDirPath = new Path(restoredDBPath, SNAPSHOT_DIR);
    FileStatus[] fileStatusArray;
    try {
        fileStatusArray = FileSystem.get(metaDirPath.toUri()).listStatus(metaDirPath);
    } catch (Exception e) {
        // The exception is passed as the trailing argument without a placeholder of its
        // own, so it is logged as a throwable and no "{}" is left over in the message.
        LOG.error("failed to list dir status for {} when loading snapshots.", metaDirPath, e);
        return snapshots;
    }
    if (fileStatusArray == null) {
        return snapshots;
    }
    for (FileStatus fileStatus : fileStatusArray) {
        Path path = fileStatus.getPath();
        String fileName = path.getName();
        long snapshotId;
        try {
            snapshotId = getSnapshotID(fileName);
        } catch (Exception e) {
            LOG.error("failed to get snapshot ID.", e);
            continue;
        }
        if (excludeSnapshots.contains(snapshotId)) {
            LOG.info("skip to load snapshot {}", snapshotId);
            continue;
        }
        try (SnapshotMetaFile.Reader reader = SnapshotMetaFile.getReader(path)) {
            // TODO checksum
            long fileSize = fileStatus.getLen();
            // the offset of the file mapping is read as a long starting 16 bytes before the end of the file
            reader.seek(fileSize - 16);
            long fileMappingOffset = reader.readLong();
            reader.seek(fileMappingOffset);
            boolean hasFileMapping = reader.readBoolean();
            Preconditions.checkState(hasFileMapping, "file mapping should always exist.");
            int fileMappingSize = reader.readInt();
            // just read base path, but do not use it
            reader.readUTF();
            Map<Integer, String> fileIDToPath = new HashMap<>();
            for (int i = 0; i < fileMappingSize; ++i) {
                String filePath = reader.readUTF();
                int id = reader.readInt();
                // read total data size, but discard it
                reader.readLong();
                fileIDToPath.put(id, filePath);
            }
            snapshots.put(snapshotId, new RestoredSnapshot(
                snapshotId, path.toUri().toString(), fileIDToPath));
            LOG.info("successfully load snapshot {} with {} files", snapshotId, fileIDToPath.size());
        } catch (Exception e) {
            // one unreadable meta file must not prevent the remaining snapshots from loading
            LOG.error("failed to load snapshot {}.", snapshotId, e);
        }
    }
    return snapshots;
}
/**
 * Builds the DFS path of a snapshot meta file: {@code <basePath>/snapshot/snapshot-<checkpointId>}.
 *
 * @param basePath base path of the DB on DFS.
 * @param checkpointId id of the checkpoint.
 * @return the path of the meta file.
 */
public Path getDFSSnapshotMetaPath(Path basePath, long checkpointId) {
    Path relativeMetaPath = new Path(
        SNAPSHOT_DIR,
        SNAPSHOT_FILE_PREFIX + SNAPSHOT_FILE_SEPERATOR + checkpointId);
    return new Path(basePath, relativeMetaPath);
}
/**
 * Builds the local path of a snapshot meta file: {@code <basePath>/snapshot-<checkpointId>}.
 *
 * @param basePath local base path.
 * @param checkpointId id of the checkpoint.
 * @return the path of the meta file.
 */
public Path getLocalSnapshotMetaPath(Path basePath, long checkpointId) {
    return new Path(
        basePath,
        SNAPSHOT_FILE_PREFIX + SNAPSHOT_FILE_SEPERATOR + checkpointId);
}
@SuppressWarnings("unchecked")
public MapSerializer>> getFileMappingSerializer() {
TupleSerializer> tuple2Serializer = new TupleSerializer<>(
(Class>) (Class) Tuple2.class,
new TypeSerializer[]{IntSerializer.INSTANCE, LongSerializer.INSTANCE}
);
MapSerializer> groupMapSerializer = new MapSerializer<>(
IntSerializer.INSTANCE, tuple2Serializer);
return new MapSerializer<>(IntSerializer.INSTANCE, groupMapSerializer);
}
/**
 * Parses the snapshot id out of a snapshot meta file name of the form
 * {@code snapshot-<id>}, where the id is a positive long.
 *
 * @param snapshotMetaName name of the meta file (without directory).
 * @return the positive snapshot id encoded in the name.
 * @throws IllegalArgumentException if the name does not have the expected form; a
 *         number-format failure is attached as the cause.
 */
private long getSnapshotID(String snapshotMetaName) {
    // A negative limit keeps trailing empty tokens, so that a name with a dangling
    // separator such as "snapshot-5-" is rejected instead of being read as snapshot 5.
    String[] splits = snapshotMetaName.split(SNAPSHOT_FILE_SEPERATOR, -1);
    NumberFormatException parseFailure = null;
    if (splits.length == 2 && SNAPSHOT_FILE_PREFIX.equals(splits[0])) {
        try {
            long snapshotId = Long.parseLong(splits[1]);
            if (snapshotId > 0) {
                return snapshotId;
            }
        } catch (NumberFormatException e) {
            // remembered so that the reason is not lost when the name is reported as invalid
            parseFailure = e;
        }
    }
    throw new IllegalArgumentException("invalid snapshot meta file name " + snapshotMetaName, parseFailure);
}
/**
 * Discards a completed snapshot: releases the snapshot reference on each of its data
 * files and registers its meta file for cleaning.
 *
 * @param completedSnapshot the snapshot that is no longer needed.
 */
private void discardCompletedSnapshot(CompletedSnapshot completedSnapshot) {
    completedSnapshot.getDataFileIDs().forEach(
        fileId -> dfsFileManager.decSnapshotReference(new FileIDImpl(fileId)));
    String metaFilePath = completedSnapshot.getMetaFilePath();
    discardCheckpointMetaFile(metaFilePath);
    LOG.info("Discard snapshot {} when this snapshot is notified as useless.", completedSnapshot.getCheckpointID());
}
/**
 * Registers the given snapshot meta file with the file cleaner. Failures are logged
 * and not propagated.
 *
 * @param metaFilePath path of the meta file to clean.
 */
private void discardCheckpointMetaFile(String metaFilePath) {
    try {
        fileCleaner.registerFilesToClean(
            Collections.singleton(metaFilePath));
    } catch (Exception cleanupFailure) {
        // best effort: a failed registration must not fail the caller
        LOG.error("Failed to delete snapshot meta file " + metaFilePath, cleanupFailure);
    }
}
/**
 * Copies the compaction related figures of a finished snapshot into the manager wide
 * compaction statistics.
 *
 * @param stat statistics of the snapshot that just finished.
 */
private void updateSnapshotCompactionStat(SnapshotStat stat) {
    // amplification ratio measured before compaction
    snapshotCompactionStat.setAmplificationRatioBeforeCompaction(
        stat.getAmplificationRatioBeforeCompaction());

    // count this snapshot if it needed compaction
    if (stat.isNeedCompaction()) {
        snapshotCompactionStat.addAndGetNumberCompaction(1);
    }

    // compaction size: current incremental size (read by adding 0) minus the size before compaction
    snapshotCompactionStat.setCompactionSize(
        stat.addAndGetIncrementalSize(0) - stat.getIncrementalSizeBeforeCompaction());

    snapshotCompactionStat.setActualAmplificationRatio(
        stat.getActualAmplificationRatio());

    // compaction duration: end time minus start time
    snapshotCompactionStat.setCompactionDuration(
        stat.getCompactionEndTime() - stat.getCompactionStartTime());
}
/**
* Check amplification when the snapshot has been completed, that's all
* files have been decided and written to the meta file, and update metric.
*/
private void checkFileAmplification(PendingSnapshot pendingSnapshot) {
// file -> group -> (numPage, dataSize)
Map>> fileMapping = pendingSnapshot.getFileMapping();
List infoList = new ArrayList<>();
long totalFileSize = 0L;
long totalDataSize = 0L;
List sharedFiles = new ArrayList<>();
for (Map.Entry>> entry : fileMapping.entrySet()) {
int fileId = entry.getKey();
int numPage = entry.getValue().values().stream().map(t -> t.f0).reduce(0, Integer::sum);
long dataSize = entry.getValue().values().stream().map(t -> t.f1).reduce(0L, Long::sum);
FileMeta fileMeta = dfsFileManager.getFileMeta(fileId);
long fileSize = fileMeta.getFileSize();
// current snapshot has not added reference to this file
if (fileMeta.addAndGetSnapshotReference(0) != 0) {
sharedFiles.add(fileId);
}
float ratio = (float) fileSize / dataSize;
totalFileSize += fileSize;
totalDataSize += dataSize;
if (LOG.isDebugEnabled()) {
infoList.add(new SnapshotCompactionImpl.SnapshotFileInfo(fileId, fileSize, dataSize, numPage, ratio));
}
}
float totalRatio = (float) totalFileSize / totalDataSize;
if (LOG.isDebugEnabled()) {
infoList.sort((f1, f2) -> Float.compare(f2.ratio, f1.ratio));
LOG.debug("completed snapshot {} statistics: number of total files {}, number of shared files {}, "
+ "total file size {}, total snapshot data size {}, amplification ratio {}",
pendingSnapshot.getCheckpointId(),
fileMapping.size(),
sharedFiles.size(),
totalFileSize,
totalDataSize,
totalRatio);
LOG.debug("completed snapshot {} file details: {}", pendingSnapshot.getCheckpointId(), infoList);
LOG.debug("completed snapshot {} shared files: {}", pendingSnapshot.getCheckpointId(), sharedFiles);
}
pendingSnapshot.getSnapshotStat().setActualAmplificationRatio(totalRatio);
snapshotCompactionStat.setActualAmplificationRatio(totalRatio);
}
}