org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManagerImpl
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine.snapshot;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.base.IntSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.MapSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.runtime.TupleSerializer;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.fs.FileCleaner;
import org.apache.flink.runtime.state.gemini.engine.fs.FileIDImpl;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileMeta;
import org.apache.flink.runtime.state.gemini.engine.memstore.WriteBufferManager;
import org.apache.flink.runtime.state.gemini.engine.metrics.SnapshotCompactionMetrics;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;

import org.apache.flink.shaded.guava18.com.google.common.util.concurrent.ThreadFactoryBuilder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Implementation of {@link SnapshotManager}.
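 *
 * <p>A sketch of the intended call sequence (the surrounding objects shown here are
 * illustrative and not defined in this file):
 * <pre>{@code
 * SnapshotManager manager = new SnapshotManagerImpl(
 *     gContext, writeBufferManager, localFileManager, dfsFileManager);
 * Future snapshotResult = manager.startSnapshot(backendSnapshotMeta);
 * // once the runtime confirms the checkpoint:
 * manager.notifySnapshotComplete(checkpointId);
 * // once an older checkpoint is subsumed by a newer one:
 * manager.notifySnapshotSubsume(olderCheckpointId);
 * }</pre>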
 */
public class SnapshotManagerImpl implements SnapshotManager {

	private static final Logger LOG = LoggerFactory.getLogger(SnapshotManagerImpl.class);

	public static final String SNAPSHOT_DIR = "snapshot";

	public static final String SNAPSHOT_FILE_PREFIX = "snapshot";

	public static final String SNAPSHOT_FILE_SEPERATOR = "-";

	private final boolean localSnapshotEnabled;

	private final FileManager localFileManager;

	private final FileManager dfsFileManager;

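	/** Whether the next snapshot must flush all pages instead of reusing files from the restored lineage. */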
	private boolean needToBreakLineage;

	private final WriteBufferManager writeBufferManager;

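	/** Completed snapshots, keyed by checkpoint id in ascending order. */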
	private final SortedMap<Long, CompletedSnapshot> completedSnapshots;

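	/** Snapshots that are still in progress, keyed by checkpoint id. */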
	private final SortedMap<Long, PendingSnapshot> runningSnapshots;

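	/** Access numbers of the running snapshots, used to protect their files from deletion. */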
	private final SortedSet<Long> runningSnapshotAccessNumber;

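	/** The smallest access number among the running snapshots, or Long.MAX_VALUE if none is running. */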
	private volatile long minRunningSnapshotAccessNumber;

	private final ExecutorService snapshotExecutor;

	private final GContext gContext;

	private final FileCleaner fileCleaner;

	/** Snapshot manager wide lock to safeguard the snapshot updates. */
	private final Object lock = new Object();

	private final SnapshotCompactionStat snapshotCompactionStat;

	@Nullable
	private SnapshotCompactionMetrics snapshotCompactionMetrics;

	public SnapshotManagerImpl(
		GContext gContext,
		WriteBufferManager writeBufferManager,
		FileManager localFileManager,
		FileManager dfsFileManager) {
		this.gContext = gContext;
		this.writeBufferManager = writeBufferManager;
		this.localFileManager = localFileManager;
		this.dfsFileManager = dfsFileManager;
		this.completedSnapshots = new TreeMap<>();
		this.runningSnapshots = new TreeMap<>();
		this.runningSnapshotAccessNumber = new TreeSet<>();
		this.minRunningSnapshotAccessNumber = Long.MAX_VALUE;
		this.localSnapshotEnabled = gContext.getGConfiguration().isLocalSnapshotEnabled();
		String prefix = gContext.getGConfiguration().getExecutorPrefixName();
		ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNameFormat(prefix + "geminiMainSnapshot-%d").build();
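		// a single-threaded executor, so snapshot completion callbacks run one at a time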
		this.snapshotExecutor = new ThreadPoolExecutor(1,
			1,
			0L,
			TimeUnit.MILLISECONDS,
			new LinkedBlockingQueue<>(Short.MAX_VALUE),
			namedThreadFactory);
		this.fileCleaner = gContext.getSupervisor().getFileCleaner();

		this.snapshotCompactionStat = new SnapshotCompactionStat();
		MetricGroup dbMetricGroup = gContext.getDBMetricGroup();
		if (dbMetricGroup != null) {
			snapshotCompactionMetrics = new SnapshotCompactionMetrics(
				dbMetricGroup.addGroup("snapshot_compaction"),
				gContext.getGConfiguration().getMetricSampleCount(),
				gContext.getGConfiguration().getMetricHistogramWindowSize());
			snapshotCompactionMetrics.register(snapshotCompactionStat);
		}

		LOG.info("SnapshotManager is created.");
	}

	@Override
	public String getNameSpace() {
		return dfsFileManager.getBasePath().toUri().toString();
	}

	@Override
	public boolean isNeedToBreakLineage() {
		return needToBreakLineage;
	}

	/**
	 * Note this method should be called before restoreLineage.
	 */
	@Override
	public void setNeedToBreakLineage(boolean needToBreakLineage) {
		this.needToBreakLineage = needToBreakLineage;
	}

	@Override
	public Future startSnapshot(BackendSnapshotMeta backendSnapshotMeta) {
		gContext.checkDBStatus();
		long startTime = System.currentTimeMillis();
		long checkpointId = backendSnapshotMeta.getCheckpointId();
		gContext.increaseCurVersion();
		// increment and record the access number so that files used by this snapshot
		// are not deleted when the DB discards them
		long accessNumber = gContext.incrementAndGetAccessNumber();

		LOG.info("GeminiDB start checkpoint {}, start time {}, access number {}.",
			checkpointId,
			startTime,
			accessNumber);

		synchronized (lock) {
			// check the argument before registering the access number so that there is
			// no need to release it if an exception happens here
			Preconditions.checkArgument(!runningSnapshots.containsKey(checkpointId),
				checkpointId + " is already running.");

			SnapshotOperation snapshotOperation = localSnapshotEnabled ?
				new LocalAndDFSSnapshotOperation(gContext, this, dfsFileManager, localFileManager) :
				new DFSSnapshotOperation(gContext, this, dfsFileManager);
			snapshotOperation.setForceFlushPage(needToBreakLineage);

			PendingSnapshot pendingSnapshot = snapshotOperation.createPendingSnapshot(
				backendSnapshotMeta, accessNumber);
			pendingSnapshot.getSnapshotStat().setSyncStartTime(startTime);
			runningSnapshots.put(checkpointId, pendingSnapshot);

			runningSnapshotAccessNumber.add(accessNumber);
			minRunningSnapshotAccessNumber = runningSnapshotAccessNumber.first();

			SnapshotCompletableFuture snapshotCompletableFuture = pendingSnapshot.getResultFuture();
			snapshotCompletableFuture.whenCompleteAsync((success, throwable) -> endSnapshot(checkpointId, throwable),
				snapshotExecutor);
			snapshotCompletableFuture.incRunningTask();
			try {
				writeBufferManager.doSnapshot(snapshotOperation);
			} finally {
				// if an exception happens in the try block, decRunningTask ensures that
				// endSnapshot still runs and the pending snapshot can be released
				snapshotCompletableFuture.decRunningTask();
				pendingSnapshot.getSnapshotStat().setAsyncStartTime(System.currentTimeMillis());
			}

			return pendingSnapshot.getDbSnapshotResultFuture();
		}
	}

	@Override
	public void endSnapshot(long checkpointId, Throwable throwable) {
		synchronized (lock) {
			PendingSnapshot pendingSnapshot = runningSnapshots.remove(checkpointId);
			if (pendingSnapshot == null) {
				LOG.warn("snapshot {} is not running, and can't be ended.", checkpointId);
				return;
			}
			pendingSnapshot.getSnapshotStat().setCompleteTime(System.currentTimeMillis());

			Throwable suppressedThrowable = null;

			if (!gContext.isDBNormal()) {
				suppressedThrowable = new GeminiRuntimeException(
					"DB is in abnormal status: " + gContext.getDBStatus().name());
			}

			if (throwable != null) {
				suppressedThrowable = ExceptionUtils.firstOrSuppressed(throwable, suppressedThrowable);
			}

			try {
				CompletableFuture dbSnapshotResultFuture = pendingSnapshot.getDbSnapshotResultFuture();
				if (suppressedThrowable == null && !pendingSnapshot.isCanceled()) {
					try {
						CompletedSnapshot completedSnapshot = createCompletedSnapshot(pendingSnapshot);
						completedSnapshots.put(checkpointId, completedSnapshot);
						dbSnapshotResultFuture.complete(pendingSnapshot.getDbSnapshotResult());
						LOG.info("GeminiDB finished checkpoint {}, SnapshotStat {}", checkpointId, pendingSnapshot.getSnapshotStat());
					} catch (Exception exception) {
						LOG.error("Failed to complete snapshot {}", checkpointId, exception);
						discardCheckpointMetaFile(pendingSnapshot.getSnapshotMetaPath().toUri().toString());
						dbSnapshotResultFuture.completeExceptionally(exception);
					}
				} else {
					if (pendingSnapshot.getDbSnapshotResult() != null) {
						discardCheckpointMetaFile(pendingSnapshot.getSnapshotMetaPath().toUri().toString());
					}

					if (pendingSnapshot.isCanceled()) {
						LOG.info("GeminiDB cancel checkpoint {}", checkpointId, suppressedThrowable);
						CancellationException cancellationException = new CancellationException();
						suppressedThrowable = suppressedThrowable == null
							? cancellationException
							: ExceptionUtils.firstOrSuppressed(suppressedThrowable, cancellationException);
					} else {
						LOG.warn("GeminiDB fail to complete checkpoint {}", checkpointId, suppressedThrowable);
					}

					dbSnapshotResultFuture.completeExceptionally(suppressedThrowable);
				}
			} finally {
				pendingSnapshot.releaseResource();
				// update the access number
				runningSnapshotAccessNumber.remove(pendingSnapshot.getAccessNumber());
				minRunningSnapshotAccessNumber = !runningSnapshotAccessNumber.isEmpty()
					? runningSnapshotAccessNumber.first() : Long.MAX_VALUE;
			}
		}
	}

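	/**
	 * Builds a {@link CompletedSnapshot} from a finished pending snapshot, adding a
	 * snapshot reference to each data file so that it is not deleted while the
	 * completed snapshot is still alive.
	 */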
	CompletedSnapshot createCompletedSnapshot(PendingSnapshot pendingSnapshot) throws Exception {
		Set<Integer> dataFileIDs = new HashSet<>();
		checkFileAmplification(pendingSnapshot);
		updateSnapshotCompactionStat(pendingSnapshot.getSnapshotStat());
		// add a snapshot reference to each data file now that the snapshot has succeeded.
		for (int id : pendingSnapshot.getFileMapping().keySet()) {
			dfsFileManager.incSnapshotReference(new FileIDImpl(id));
			dataFileIDs.add(id);
		}

		return new CompletedSnapshot(
			pendingSnapshot.getCheckpointId(),
			pendingSnapshot.getSnapshotMetaPath().toUri().toString(),
			dataFileIDs);
	}

	@Override
	public long getMinRunningSnapshotAccessNumber() {
		return minRunningSnapshotAccessNumber;
	}

	/**
	 * Note this method should be called before endSnapshot for the same checkpoint.
	 */
	@Nullable
	@Override
	public PendingSnapshot getPendingSnapshot(long checkpointId) {
		synchronized (lock) {
			return runningSnapshots.get(checkpointId);
		}
	}

	@Override
	public ExecutorService getSnapshotExecutor() {
		return snapshotExecutor;
	}

	@Override
	public void notifySnapshotComplete(long snapshotId) {
		// If lineage needed to be broken, the completed checkpoint has broken it, so reset the flag.
		if (needToBreakLineage) {
			needToBreakLineage = false;
			LOG.info("As checkpoint {} completed, we would no longer need to flush all pages out.", snapshotId);
		}
	}

	@Override
	public void notifySnapshotAbort(long snapshotId) {
		CompletedSnapshot snapshotToAbort = null;
		boolean runningSnapshotCanceled = false;
		synchronized (lock) {
			PendingSnapshot pendingSnapshot = runningSnapshots.get(snapshotId);
			if (pendingSnapshot != null) {
				// TODO currently, setting the pending checkpoint as canceled does not interrupt the async checkpoint phase.
				pendingSnapshot.resultFuture.setEndSnapshot();
				pendingSnapshot.setCanceled(true);
				runningSnapshotCanceled = true;
			}

			if (!runningSnapshotCanceled) {
				snapshotToAbort = completedSnapshots.remove(snapshotId);
			}
		}

		if (snapshotToAbort != null) {
			discardCompletedSnapshot(snapshotToAbort);
		}
	}

	@Override
	public void notifySnapshotSubsume(long snapshotId) {
		Set<CompletedSnapshot> snapshotsToAbort = new HashSet<>();
		synchronized (lock) {
			Iterator<Map.Entry<Long, CompletedSnapshot>> iterator = completedSnapshots.entrySet().iterator();
			while (iterator.hasNext()) {
				Map.Entry<Long, CompletedSnapshot> entry = iterator.next();
				if (entry.getKey() <= snapshotId) {
					iterator.remove();
					snapshotsToAbort.add(entry.getValue());
				} else {
					break;
				}
			}
		}
		for (CompletedSnapshot completedSnapshot : snapshotsToAbort) {
			discardCompletedSnapshot(completedSnapshot);
		}
	}

	@Override
	public Map<Long, RestoredSnapshot> restore(
		long snapshotId,
		Map<Integer, String> fileMapping,
		String restoredBasePath) {
		Map<Long, RestoredSnapshot> snapshots;
		if (!needToBreakLineage) {
			snapshots = loadSnapshots(restoredBasePath, Collections.singleton(snapshotId));
			RestoredSnapshot restoredSnapshot = new RestoredSnapshot(
				snapshotId,
				getDFSSnapshotMetaPath(new Path(restoredBasePath), snapshotId).toUri().toString(),
				fileMapping);
			snapshots.put(snapshotId, restoredSnapshot);
			restoreSnapshots(snapshots);
			LOG.info("restore snapshot manager successfully with {} snapshots: {} from {}.", snapshots.size(), snapshots.keySet(), restoredBasePath);
		} else {
			snapshots = Collections.emptyMap();
			LOG.info("no snapshot is restored because lineage needs to be broken");
		}
		return snapshots;
	}

	@Override
	public void close() throws IOException {
		synchronized (lock) {
			snapshotExecutor.shutdownNow();
			LOG.info("SnapshotManager is closed");
			runningSnapshotAccessNumber.clear();
			runningSnapshots.clear();
			completedSnapshots.clear();
		}
	}

	@VisibleForTesting
	public Map<Long, CompletedSnapshot> getCompletedSnapshots() {
		return Collections.unmodifiableMap(completedSnapshots);
	}

	@VisibleForTesting
	Map<Long, PendingSnapshot> getRunningSnapshots() {
		return runningSnapshots;
	}

	@VisibleForTesting
	SortedSet<Long> getRunningSnapshotAccessNumber() {
		return runningSnapshotAccessNumber;
	}

	@VisibleForTesting
	void setMinRunningSnapshotAccessNumber(long accessNumber) {
		this.minRunningSnapshotAccessNumber = accessNumber;
	}

	private void restoreSnapshots(Map<Long, RestoredSnapshot> snapshots) {
		synchronized (lock) {
			for (Map.Entry<Long, RestoredSnapshot> entry : snapshots.entrySet()) {
				long checkpointId = entry.getKey();
				RestoredSnapshot restoredSnapshot = entry.getValue();
				completedSnapshots.put(checkpointId,
					new CompletedSnapshot(checkpointId,
						restoredSnapshot.getMetaFilePath(),
						restoredSnapshot.getFileMapping().keySet()));
			}
		}
	}

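	/**
	 * Scans the snapshot directory under the restored DB path and rebuilds a
	 * {@link RestoredSnapshot} per meta file from the file mapping recorded at the
	 * tail of that file, skipping the ids in {@code excludeSnapshots}.
	 */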
	private Map<Long, RestoredSnapshot> loadSnapshots(
				String restoredDBPath,
				Set<Long> excludeSnapshots) {
		Map<Long, RestoredSnapshot> snapshots = new HashMap<>();
		Path metaDirPath = new Path(restoredDBPath, SNAPSHOT_DIR);
		FileStatus[] fileStatusArray;
		try {
			fileStatusArray = FileSystem.get(metaDirPath.toUri()).listStatus(metaDirPath);
		} catch (Exception e) {
			LOG.error("failed to list dir status for {} when loading snapshots, {}", metaDirPath, e);
			return snapshots;
		}
		if (fileStatusArray == null) {
			return snapshots;
		}
		for (FileStatus fileStatus : fileStatusArray) {
			Path path = fileStatus.getPath();
			String fileName = path.getName();
			long snapshotId;
			try {
				snapshotId = getSnapshotID(fileName);
			} catch (Exception e) {
				LOG.error("failed to get snapshot ID.", e);
				continue;
			}
			if (excludeSnapshots.contains(snapshotId)) {
				LOG.info("skip to load snapshot {}", snapshotId);
				continue;
			}
			try (SnapshotMetaFile.Reader reader = SnapshotMetaFile.getReader(path)) {
				// TODO checksum
				long fileSize = fileStatus.getLen();
				// the offset of the file mapping is recorded at the tail of the meta file
				reader.seek(fileSize - 16);
				long fileMappingOffset = reader.readLong();
				reader.seek(fileMappingOffset);
				boolean hasFileMapping = reader.readBoolean();
				Preconditions.checkState(hasFileMapping, "file mapping should always exist.");
				int fileMappingSize = reader.readInt();
				// just read the base path, but do not use it
				reader.readUTF();
				Map<Integer, String> fileIDToPath = new HashMap<>();
				for (int i = 0; i < fileMappingSize; ++i) {
					String filePath = reader.readUTF();
					Integer id = reader.readInt();
					// read the total data size, but discard it
					reader.readLong();
					fileIDToPath.put(id, filePath);
				}
				snapshots.put(snapshotId, new RestoredSnapshot(
					snapshotId, path.toUri().toString(), fileIDToPath));
				LOG.info("successfully load snapshot {} with {} files", snapshotId, fileIDToPath.size());
			} catch (Exception e) {
				LOG.error("failed to load snapshot {}, {}", snapshotId, e);
			}
		}
		return snapshots;
	}

	public Path getDFSSnapshotMetaPath(Path basePath, long checkpointId) {
		String name = SNAPSHOT_FILE_PREFIX + SNAPSHOT_FILE_SEPERATOR + checkpointId;
		return new Path(basePath, new Path(SNAPSHOT_DIR, name));
	}

	public Path getLocalSnapshotMetaPath(Path basePath, long checkpointId) {
		String name = SNAPSHOT_FILE_PREFIX + SNAPSHOT_FILE_SEPERATOR + checkpointId;
		return new Path(basePath, name);
	}

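	/**
	 * Returns the serializer for the file mapping, whose layout is
	 * file id -> (group id -> (page count, data size)).
	 */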
	@SuppressWarnings("unchecked")
	public MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> getFileMappingSerializer() {
		TupleSerializer<Tuple2<Integer, Long>> tuple2Serializer = new TupleSerializer<>(
			(Class<Tuple2<Integer, Long>>) (Class) Tuple2.class,
			new TypeSerializer[]{IntSerializer.INSTANCE, LongSerializer.INSTANCE}
		);
		MapSerializer<Integer, Tuple2<Integer, Long>> groupMapSerializer = new MapSerializer<>(
			IntSerializer.INSTANCE, tuple2Serializer);
		return new MapSerializer<>(IntSerializer.INSTANCE, groupMapSerializer);
	}

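	/**
	 * Parses the checkpoint id from a snapshot meta file name of the form
	 * {@code snapshot-<checkpointId>}.
	 */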
	private long getSnapshotID(String snapshotMetaName) {
		String[] splits = snapshotMetaName.split(SNAPSHOT_FILE_SEPERATOR);
		if (splits.length == 2 && SNAPSHOT_FILE_PREFIX.equals(splits[0])) {
			try {
				long snapshotId = Long.parseLong(splits[1]);
				if (snapshotId > 0) {
					return snapshotId;
				}
			} catch (NumberFormatException e) {
				// not a valid snapshot id; fall through to the exception below
			}
		}
		throw new IllegalArgumentException("invalid snapshot meta file name " + snapshotMetaName);
	}

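	/** Releases the snapshot's references on its data files and schedules its meta file for deletion. */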
	private void discardCompletedSnapshot(CompletedSnapshot completedSnapshot) {
		for (Integer fileId : completedSnapshot.getDataFileIDs()) {
			dfsFileManager.decSnapshotReference(new FileIDImpl(fileId));
		}

		discardCheckpointMetaFile(completedSnapshot.getMetaFilePath());
		LOG.info("Discard snapshot {} when this snapshot is notified as useless.", completedSnapshot.getCheckpointID());
	}

	private void discardCheckpointMetaFile(String metaFilePath) {
		try {
			fileCleaner.registerFilesToClean(Collections.singleton(metaFilePath));
		} catch (Exception e) {
			LOG.error("Failed to delete snapshot meta file " + metaFilePath, e);
		}
	}

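	/** Copies the compaction statistics of a single snapshot into the DB-wide stat object that backs the metrics. */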
	private void updateSnapshotCompactionStat(SnapshotStat stat) {
		snapshotCompactionStat.setAmplificationRatioBeforeCompaction(stat.getAmplificationRatioBeforeCompaction());
		if (stat.isNeedCompaction()) {
			snapshotCompactionStat.addAndGetNumberCompaction(1);
		}
		snapshotCompactionStat.setCompactionSize(stat.addAndGetIncrementalSize(0) - stat.getIncrementalSizeBeforeCompaction());
		snapshotCompactionStat.setActualAmplificationRatio(stat.getActualAmplificationRatio());
		snapshotCompactionStat.setCompactionDuration(stat.getCompactionEndTime() - stat.getCompactionStartTime());
	}

	/**
	 * Checks the file amplification once the snapshot has completed, that is, once
	 * all files have been decided and written to the meta file, and updates the
	 * corresponding metrics.
	 */
	private void checkFileAmplification(PendingSnapshot pendingSnapshot) {
		// file -> group -> (numPage, dataSize)
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMapping = pendingSnapshot.getFileMapping();
		List<SnapshotCompactionImpl.SnapshotFileInfo> infoList = new ArrayList<>();
		long totalFileSize = 0L;
		long totalDataSize = 0L;
		List<Integer> sharedFiles = new ArrayList<>();
		for (Map.Entry<Integer, Map<Integer, Tuple2<Integer, Long>>> entry : fileMapping.entrySet()) {
			int fileId = entry.getKey();
			int numPage = entry.getValue().values().stream().map(t -> t.f0).reduce(0, Integer::sum);
			long dataSize = entry.getValue().values().stream().map(t -> t.f1).reduce(0L, Long::sum);
			FileMeta fileMeta = dfsFileManager.getFileMeta(fileId);
			long fileSize = fileMeta.getFileSize();
			// the current snapshot has not yet added its reference to this file, so a
			// non-zero reference count means the file is shared with other snapshots
			if (fileMeta.addAndGetSnapshotReference(0) != 0) {
				sharedFiles.add(fileId);
			}
			float ratio = (float) fileSize / dataSize;
			totalFileSize += fileSize;
			totalDataSize += dataSize;
			if (LOG.isDebugEnabled()) {
				infoList.add(new SnapshotCompactionImpl.SnapshotFileInfo(fileId, fileSize, dataSize, numPage, ratio));
			}
		}

		float totalRatio = (float) totalFileSize / totalDataSize;

		if (LOG.isDebugEnabled()) {
			infoList.sort((f1, f2) -> Float.compare(f2.ratio, f1.ratio));
			LOG.debug("completed snapshot {} statistics: number of total files {}, number of shared files {}, "
				+ "total file size {}, total snapshot data size {}, amplification ratio {}",
				pendingSnapshot.getCheckpointId(),
				fileMapping.size(),
				sharedFiles.size(),
				totalFileSize,
				totalDataSize,
				totalRatio);
			LOG.debug("completed snapshot {} file details: {}", pendingSnapshot.getCheckpointId(), infoList);
			LOG.debug("completed snapshot {} shared files: {}", pendingSnapshot.getCheckpointId(), sharedFiles);
		}

		pendingSnapshot.getSnapshotStat().setActualAmplificationRatio(totalRatio);
		snapshotCompactionStat.setActualAmplificationRatio(totalRatio);
	}
}