/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine.restore;

import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.base.IntSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.MapSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.runtime.TupleSerializer;
import org.apache.flink.core.fs.CloseableRegistry;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataInputViewStreamWrapper;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.state.gemini.engine.GConfiguration;
import org.apache.flink.runtime.state.gemini.engine.GRegion;
import org.apache.flink.runtime.state.gemini.engine.GRegionContext;
import org.apache.flink.runtime.state.gemini.engine.GRegionID;
import org.apache.flink.runtime.state.gemini.engine.GRegionIDImpl;
import org.apache.flink.runtime.state.gemini.engine.GTable;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.Supervisor;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileMeta;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddress;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndex;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndexHashImpl;
import org.apache.flink.runtime.state.gemini.engine.snapshot.DBSnapshotMeta;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManager;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotMetaFile;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FileUtils;
import org.apache.flink.util.FlinkRuntimeException;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.StringUtils;
import org.apache.flink.util.function.ThrowingRunnable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import static org.apache.flink.runtime.concurrent.Executors.newDirectExecutorService;

/**
 * Restore operation for Gemini.
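 *
 * <p>The overall flow, mirroring the numbered steps in {@link #restore}: read each
 * snapshot meta file, rebuild the per-region page indexes, restore the DFS and (if
 * present) local file mappings into the file managers, and optionally pre-fetch the
 * referenced files from DFS to local storage.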
 */
public class GeminiRestoreOperation {
	private static final Logger LOG = LoggerFactory.getLogger(GeminiRestoreOperation.class);

	private final GContext context;
	private final GConfiguration configuration;
	private final Supervisor supervisor;
	private final CloseableRegistry closeableRegistry;

	public GeminiRestoreOperation(GContext context, GConfiguration configuration, CloseableRegistry closeableRegistry) {
		this.context = context;
		this.configuration = configuration;
		this.supervisor = this.context.getSupervisor();
		this.closeableRegistry = closeableRegistry;
	}

	/**
	 * Restores the given tables from the given snapshot metas.
	 *
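	 * <p>Illustrative usage (hypothetical; a real caller obtains the snapshot metas and
	 * tables from the restored state handles, and the region range from the key-group
	 * assignment of this backend):
	 * <pre>{@code
	 * GeminiRestoreOperation restoreOperation =
	 *     new GeminiRestoreOperation(gContext, gConfiguration, cancelStreamRegistry);
	 * restoreOperation.restore(snapshotMetas, restoredTables, startKeyGroup, endKeyGroup);
	 * }</pre>
	 *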
	 * @param snapshotMetas Collection of restored snapshot metas; a size greater than 1 indicates rescaling.
	 * @param restoredTables The tables to restore, as recorded in the snapshot handle's meta.
	 * @param startRegionId The start region id to restore (inclusive).
	 * @param endRegionId The end region id to restore (inclusive).
	 * @throws Exception if the restore fails.
	 */
	@SuppressWarnings("unchecked")
	public void restore(
		Collection<DBSnapshotMeta> snapshotMetas,
		Map<String, GTable> restoredTables,
		int startRegionId,
		int endRegionId) throws Exception {

		long startTime = System.currentTimeMillis();

		MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMappingSerializer = getFileMappingSerializer();

		long restoredCheckpointId = 0;

		// dfs/local file mapping to restore for DB.
		Set<String> restoredDfsBasePaths = new HashSet<>(), restoredLocalBasePaths = new HashSet<>();
		Map<Integer, FileMeta.RestoredFileMeta> restoredDfsFileMetas = new HashMap<>(), restoredLocalFileMetas = new HashMap<>();
		// {fileId -> {group -> [reference, size]}}
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> dfsFileMapping = new HashMap<>(), localFileMapping = new HashMap<>();

		// TODO #SR error handling.
		LOG.info("Start to restore from snapshot for GeminiDB, snapshotMetas {}, tables {}, region from {} to {}.",
			snapshotMetas, restoredTables, startRegionId, endRegionId);
		boolean needToBreakLineage = false;
		boolean needToFetchFiles = configuration.isEnableRestorePreFetch();

		for (DBSnapshotMeta meta : snapshotMetas) {
			restoredCheckpointId = meta.getCheckPointId();
			long[] regionOffsets = meta.getRegionOffsets();

			int metaStartRegionId = meta.getStartRegionId();
			int metaEndRegionId = meta.getEndRegionId();
			int startGroup = Math.max(startRegionId, metaStartRegionId);
			int endGroup = Math.min(endRegionId, metaEndRegionId);
			if (!needToBreakLineage) {
				if (!supervisor.getSnapshotManager().getNameSpace().equals(meta.getNameSpace())) {
					needToBreakLineage = true;
					LOG.info("Current name space {} differs from previous snapshot's name space {}," +
						" enforce the first snapshot to flush all pages out.", supervisor.getSnapshotManager().getNameSpace(), meta.getNameSpace());
				}
			}
			Preconditions.checkArgument(startGroup <= endGroup,
				String.format("Useless meta (key-group range [%s, %s]) should not be restored within this state-backend (key-group range [%s, %s])",
					metaStartRegionId, metaEndRegionId, startRegionId, endRegionId));

			try (SnapshotMetaFile.Reader reader = SnapshotMetaFile.getReader(new Path(meta.getSnapshotMetaPath()))) {
				int tableSize = reader.readInt();

				reader.seek(regionOffsets[startGroup - metaStartRegionId]);
				for (int group = startGroup; group <= endGroup; group++) {
					int writtenKeyGroupIndex = reader.readInt();
					Preconditions.checkState(writtenKeyGroupIndex == group,
						"Unexpected key-group in restore.");

					for (int i = 0; i < tableSize; i++) {
						// 1. restore table.
						String tableName = reader.readUTF();

						int expectedRegions = reader.readInt();
						// 2. restore regions within this snapshot meta.
						for (int j = 0; j < expectedRegions; j++) {
							int regionCode = reader.readInt();
							GRegionID gRegionId = new GRegionIDImpl(regionCode);
							int regionId = gRegionId.getId();
							long lastSeqId = reader.readLong();
							long removeAllSeqId = reader.readLong();
							GTable table = restoredTables.get(tableName);
							GRegionContext regionContext = new GRegionContext(
								context, tableName, gRegionId, table.getTableDescription().getPageSerde(), lastSeqId, removeAllSeqId);
							// 3. restore logical page chains of this region.
							PageIndex pageIndex = new PageIndexHashImpl.Builder(reader, regionContext).build();

							// index 0 is data. Only 1 index is supported for now. TODO: support more indexes.
							if (gRegionId.getIndexID() == 0) {
								// data region.
								table.setRegion(regionId,
									table.getTableDescription().createRegion(context, table, gRegionId, pageIndex));
							} else {
								// index region
								table.setIndexRegion(regionId,
									table.getIndexDescription().createRegion(context, table, gRegionId, pageIndex));
							}
						}
					}
				}

				reader.seek(meta.getDfsFileMappingOffset());
				// 4. restore dfs file mapping.
				Tuple2<Map<Integer, Map<Integer, Tuple2<Integer, Long>>>, String> dfsFileMappingAndPath =
					restoreFileMapping(reader, restoredDfsFileMetas, fileMappingSerializer);
				Preconditions.checkNotNull(dfsFileMappingAndPath, "dfs meta should always have a file mapping");
				if (dfsFileMappingAndPath.f0 != null) {
					loadFromRestoredFileMapping(dfsFileMappingAndPath.f0, dfsFileMapping);
				}
				restoredDfsBasePaths.add(dfsFileMappingAndPath.f1);

				// 5. restore local file mapping if possible.
				Tuple2<Map<Integer, Map<Integer, Tuple2<Integer, Long>>>, String> localFileMappingAndPath =
					restoreFileMapping(reader, restoredLocalFileMetas, fileMappingSerializer);
				if (localFileMappingAndPath != null) {
					// since a local file mapping exists, we no longer need to fetch all files locally.
					needToFetchFiles = false;
					if (localFileMappingAndPath.f0 != null) {
						loadFromRestoredFileMapping(localFileMappingAndPath.f0, localFileMapping);
					}
					restoredLocalBasePaths.add(localFileMappingAndPath.f1);
				}
			}
		}

		Preconditions.checkState(!restoredDfsBasePaths.isEmpty(), "restored dfs base path should not be empty.");

		SnapshotManager snapshotManager = supervisor.getSnapshotManager();
		snapshotManager.setNeedToBreakLineage(needToBreakLineage);

		Map<Integer, FileMeta.RestoredFileMeta> dbUsedFileMeta = dfsFileMapping.size() == 0 ? new HashMap<>() :
			getRestoredFileMetaUsedByDB(restoredDfsFileMetas, dfsFileMapping, startRegionId, endRegionId, !needToBreakLineage);

		FileManager dfsFileManager = supervisor.getDfsFileManager();

		// 6. load snapshots from remote path one by one if possible.
		Map<Long, SnapshotManager.RestoredSnapshot> restoredSnapshots = snapshotManager.restore(
			restoredCheckpointId,
			getMappingForFileIdToPath(dbUsedFileMeta),
			restoredDfsBasePaths.iterator().next());

		// 7. use restored dfs file mapping to update dfs file manager.
		dfsFileManager.restore(mergeDbAndSnapshotFileMeta(dbUsedFileMeta, restoredSnapshots, !needToBreakLineage));

		FileManager localFileManager = supervisor.getLocalFileManager();
		// 8. use restored local file mapping to update local file manager if possible.
		Path localBasePath = localFileManager.getBasePath();
		File localBaseFile = new File(localBasePath.getPath());
		if (localFileMapping.size() > 0) {
			createHardLinkForRestoredLocalFile(restoredLocalBasePaths.iterator().next(),
				restoredLocalFileMetas, localBasePath);
			// currently local state uses hard links, and the restored files should always
			// be deleted by the current DB.
			localFileManager.restore(getRestoredFileMetaUsedByDB(
				restoredLocalFileMetas, localFileMapping, startRegionId, endRegionId, true));
		}

		LOG.info("Restored successfully from {} for {}, region from {} to {}, consumed {} ms.",
			snapshotMetas, restoredTables, startRegionId, endRegionId, System.currentTimeMillis() - startTime);

		// 9. if file fetching is enabled and there are no local files (e.g. restoring from a checkpoint rather than a failover), download files to local.
		if (needToFetchFiles) {
			long startToFetchTime = System.currentTimeMillis();
			boolean success = false;
			// TODO implement an async mode that downloads files without updating the local addresses directly.
			int threadNum = configuration.getFetchFilesThreadNum();
			try {
				// download all files referenced by the DFS file manager to local storage.

				Map<String, Path> remoteToLocalFilePaths = new HashMap<>(dbUsedFileMeta.size());
				Map<Integer, FileMeta.RestoredFileMeta> dbUsedLocalFileMeta = new HashMap<>(dbUsedFileMeta.size());
				for (Map.Entry<Integer, FileMeta.RestoredFileMeta> entry : dbUsedFileMeta.entrySet()) {
					FileMeta.RestoredFileMeta fileMeta = entry.getValue();
					String remotePath = fileMeta.filePath;
					Path localPath = new Path(localBasePath, extractFileName(remotePath));
					remoteToLocalFilePaths.put(remotePath, localPath);
					dbUsedLocalFileMeta.put(entry.getKey(),
						FileMeta.RestoredFileMeta.of(
							fileMeta.id,
							localPath.toString(),
							fileMeta.fileSize,
							fileMeta.dataSize,
							fileMeta.dbReference,
							0,
							true));
				}
				// 9.1. download files to local
				downloadDataForAllFiles(remoteToLocalFilePaths, threadNum, closeableRegistry);

				// 9.2. restore local file manager with dbUsedFileMeta
				localFileManager.restore(dbUsedLocalFileMeta);

				// 9.3. update local address in page address synchronously
				for (GTable gTable : supervisor.getAllTables().values()) {
					Iterator<GRegion> iterator = gTable.regionIterator();
					while (iterator.hasNext()) {
						GRegion region = iterator.next();
						Iterator<PageAddress> pageAddressIterator = region.getPageStore().getPageIndex().pageIterator();
						while (pageAddressIterator.hasNext()) {
							PageAddress pageAddress = pageAddressIterator.next();
							long dfsAddress = pageAddress.getDfsAddress();
							pageAddress.setLocalAddress(dfsAddress);
							pageAddress.setLocalStatus(true);
						}
					}
				}
				success = true;
			} finally {
				if (!success) {
					FileStatus[] fileStatuses = localBasePath.getFileSystem().listStatus(localBasePath);
					if (fileStatuses != null) {
						LOG.info("Cleaning local downloaded files when not successful to restore.");
						FileUtils.deleteDirectoryQuietly(localBaseFile);
					}
					LOG.info("Failed to fetch files to local or re-construct the local address, consumed {} ms.", System.currentTimeMillis() - startToFetchTime);
				} else {
					LOG.info("Successfully fetch files to local and re-construct the local address, consumed {} ms.", System.currentTimeMillis() - startToFetchTime);
				}
			}
			// TODO fill data into memory if necessary
		}
	}

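	/**
	 * Downloads all given remote files to their local target paths. Uses a fixed-size
	 * thread pool when more than one restoring thread is configured, otherwise a direct
	 * executor, and unwraps {@link IOException}s from the failed download futures.
	 */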
	private void downloadDataForAllFiles(
		Map<String, Path> remoteFilePaths,
		int restoringThreadNum,
		CloseableRegistry closeableRegistry) throws Exception {

		final ExecutorService executorService = restoringThreadNum > 1 ?
			Executors.newFixedThreadPool(restoringThreadNum) : newDirectExecutorService();
		try {
			List<Runnable> runnables = createDownloadRunnables(remoteFilePaths, closeableRegistry);
			List<CompletableFuture<Void>> futures = new ArrayList<>(runnables.size());
			for (Runnable runnable : runnables) {
				futures.add(CompletableFuture.runAsync(runnable, executorService));
			}
			FutureUtils.waitForAll(futures).get();
		} catch (ExecutionException e) {
			Throwable throwable = ExceptionUtils.stripExecutionException(e);
			throwable = ExceptionUtils.stripException(throwable, RuntimeException.class);
			if (throwable instanceof IOException) {
				throw (IOException) throwable;
			} else {
				throw new FlinkRuntimeException(e);
			}
		} finally {
			executorService.shutdownNow();
		}
	}

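	/**
	 * Wraps each remote-to-local file pair into an unchecked {@link Runnable} that
	 * performs the actual copy via {@link #downloadData}.
	 */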
	private List<Runnable> createDownloadRunnables(
		Map<String, Path> remoteFilePaths,
		CloseableRegistry closeableRegistry) {
		List<Runnable> runnables = new ArrayList<>(remoteFilePaths.size());
		for (Map.Entry<String, Path> entry : remoteFilePaths.entrySet()) {
			runnables.add(ThrowingRunnable.unchecked(
				() -> downloadData(new Path(entry.getKey()), entry.getValue(), closeableRegistry)));
		}
		return runnables;
	}

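	/**
	 * Extracts the file name, i.e. the part after the last '/', from the given path.
	 */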
	private String extractFileName(String filePath) throws IOException {
		int lastIndexOfSeparator = filePath.lastIndexOf("/");
		String fileName = filePath.substring(lastIndexOfSeparator + 1);
		if (StringUtils.isNullOrWhitespaceOnly(fileName)) {
			throw new IOException("Fail to extract file name from given file path " + filePath);
		}
		return fileName;
	}

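	/**
	 * Copies a single file from the remote path to the local restore path in 64 KB
	 * chunks. Both streams are registered with the closeable registry so that an
	 * external cancellation closes them and aborts the copy.
	 */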
	private void downloadData(
		Path remoteFilePath,
		Path restoreFilePath,
		CloseableRegistry closeableRegistry) throws IOException {
		FileSystem restoreFileSystem = restoreFilePath.getFileSystem();

		FSDataInputStream inputStream = null;
		FSDataOutputStream outputStream = null;

		try {
			inputStream = remoteFilePath.getFileSystem().open(remoteFilePath);
			closeableRegistry.registerCloseable(inputStream);

			outputStream = restoreFileSystem.create(restoreFilePath, FileSystem.WriteMode.OVERWRITE);
			closeableRegistry.registerCloseable(outputStream);

			byte[] buffer = new byte[64 * 1024];
			while (true) {
				int numBytes = inputStream.read(buffer);
				if (numBytes == -1) {
					break;
				}

				outputStream.write(buffer, 0, numBytes);
			}
		} finally {
			if (closeableRegistry.unregisterCloseable(inputStream)) {
				inputStream.close();
			}

			if (closeableRegistry.unregisterCloseable(outputStream)) {
				outputStream.close();
			}
		}
	}

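	/**
	 * Builds the serializer for the file mapping, whose layout is
	 * {fileId -> {group -> (reference, size)}}: an outer map keyed by file id, an inner
	 * map keyed by group, and a (reference count, data size) tuple as the value.
	 */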
		@SuppressWarnings("unchecked")
	private MapSerializer>> getFileMappingSerializer() {
		TupleSerializer> tuple2Serializer = new TupleSerializer<>(
			(Class>) (Class) Tuple2.class,
			new TypeSerializer[]{IntSerializer.INSTANCE, LongSerializer.INSTANCE}
		);
		MapSerializer> groupMapSerializer = new MapSerializer<>(
			IntSerializer.INSTANCE, tuple2Serializer);
		return new MapSerializer<>(IntSerializer.INSTANCE, groupMapSerializer);
	}

	/**
	 * Returns the filtered file mapping {fileId -> {groupId -> [reference, size]}} and the base restore path.
	 */
	private Tuple2<Map<Integer, Map<Integer, Tuple2<Integer, Long>>>, String> restoreFileMapping(
		SnapshotMetaFile.Reader reader,
		Map<Integer, FileMeta.RestoredFileMeta> restoredFileMetas,
		MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMappingSerializer) throws IOException {

		boolean hasFileMapping = reader.readBoolean();
		if (hasFileMapping) {
			int fileMappingSize = reader.readInt();
			String restoredBasePath = reader.readUTF();
			if (fileMappingSize > 0) {
				for (int i = 0; i < fileMappingSize; ++i) {
					String filePath = reader.readUTF();
					int id = reader.readInt();
					long fileSize = reader.readLong();
					restoredFileMetas.put(id, FileMeta.RestoredFileMeta.of(id, filePath, fileSize));
				}
				DataInputView dataInputView = new DataInputViewStreamWrapper(reader);
				return Tuple2.of(fileMappingSerializer.deserialize(dataInputView), restoredBasePath);
			}
			return Tuple2.of(null, restoredBasePath);
		} else {
			return null;
		}
	}

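	/**
	 * Merges a restored file mapping into the accumulated mapping. File ids are unique
	 * across snapshots, so the per-file group maps can simply be combined.
	 */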
	private void loadFromRestoredFileMapping(
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> restoredFileMapping,
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMapping) {
		for (Map.Entry<Integer, Map<Integer, Tuple2<Integer, Long>>> mapEntry : restoredFileMapping.entrySet()) {
			Map<Integer, Tuple2<Integer, Long>> map = fileMapping.get(mapEntry.getKey());
			if (map == null) {
				fileMapping.put(mapEntry.getKey(), mapEntry.getValue());
			} else {
				// file ids are unique.
				map.putAll(mapEntry.getValue());
			}
		}
	}

	/**
	 * Reconstructs file metas from fileMapping, which is {fileId -> {group -> [reference, size]}}.
	 */
	private Map<Integer, FileMeta.RestoredFileMeta> getRestoredFileMetaUsedByDB(
		Map<Integer, FileMeta.RestoredFileMeta> restoredFileMetas,
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMapping,
		int startRegionId,
		int endRegionId,
		boolean canDeleteFile) {

		Map<Integer, FileMeta.RestoredFileMeta> resultFileMetas = new HashMap<>(restoredFileMetas.size());
		for (Map.Entry<Integer, Map<Integer, Tuple2<Integer, Long>>> entry : fileMapping.entrySet()) {
			int fileId = entry.getKey();
			FileMeta.RestoredFileMeta fileMeta = restoredFileMetas.get(fileId);
			long dataSize = 0;
			int dbReference = 0;
			Map<Integer, Tuple2<Integer, Long>> groupInfo = entry.getValue();
			for (Map.Entry<Integer, Tuple2<Integer, Long>> e : groupInfo.entrySet()) {
				if (e.getKey() >= startRegionId && e.getKey() <= endRegionId) {
					dbReference += e.getValue().f0;
					dataSize += e.getValue().f1;
				}
			}
			// we only add fileId in the necessary groups.
			if (dbReference > 0) {
				fileMeta.setUsedDataSizeAndReference(dataSize, dbReference, 0, canDeleteFile);
				resultFileMetas.put(fileId, fileMeta);
			}
		}

		return resultFileMetas;
	}

	/**
	 * Merges the DB-used file meta with the meta restored from the snapshots' file mappings.
	 */
	private Map<Integer, FileMeta.RestoredFileMeta> mergeDbAndSnapshotFileMeta(
		Map<Integer, FileMeta.RestoredFileMeta> dbUsedFileMeta,
		Map<Long, SnapshotManager.RestoredSnapshot> restoredSnapshots,
		boolean canDeleteFile) {
		Map<Integer, FileMeta.RestoredFileMeta> mergedFileMetas = new HashMap<>(dbUsedFileMeta);
		for (SnapshotManager.RestoredSnapshot restoredSnapshot : restoredSnapshots.values()) {
			Map<Integer, String> fileMapping = restoredSnapshot.getFileMapping();
			for (Map.Entry<Integer, String> entry : fileMapping.entrySet()) {
				int fileId = entry.getKey();
				String path = entry.getValue();
				FileMeta.RestoredFileMeta fileMeta = mergedFileMetas.get(fileId);
				if (fileMeta != null) {
					fileMeta.snapshotReference += 1;
				} else {
					fileMeta = FileMeta.RestoredFileMeta.of(
						fileId, path, 0, 0, 0, 1, canDeleteFile);
					mergedFileMetas.put(fileId, fileMeta);
				}
			}
		}
		return mergedFileMetas;
	}

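	/**
	 * Projects the restored file metas to a plain {fileId -> filePath} mapping.
	 */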
	private Map<Integer, String> getMappingForFileIdToPath(Map<Integer, FileMeta.RestoredFileMeta> restoredFileMetas) {
		Map<Integer, String> mapping = new HashMap<>();
		restoredFileMetas.forEach((k, v) -> mapping.put(k, v.filePath));
		return mapping;
	}

	/**
	 * Hard link the restored local file to current working directory,
	 * and update the file path in file meta.
	 */
	private void createHardLinkForRestoredLocalFile(
		String restoredLocalBasePath,
		Map<Integer, FileMeta.RestoredFileMeta> restoredFileMetas,
		Path workingBasePath) throws Exception {

		File restoredLocalBaseDir = new File(new Path(restoredLocalBasePath).toUri().getPath());
		File workingBaseDir = new File(workingBasePath.toUri().getPath());
		if (workingBaseDir.exists()) {
			FileUtils.deleteDirectory(workingBaseDir);
		}
		if (!workingBaseDir.mkdirs()) {
			throw new IOException("Local working directory " + workingBaseDir + " already exists");
		}
		for (Map.Entry<Integer, FileMeta.RestoredFileMeta> entry : restoredFileMetas.entrySet()) {
			FileMeta.RestoredFileMeta fileMeta = entry.getValue();
			String fileName = fileMeta.filePath;
			File src = new File(restoredLocalBaseDir, fileName);
			File target = new File(workingBaseDir, fileName);
			try {
				Files.createLink(target.toPath(), src.toPath());
			} catch (Exception e) {
				LOG.error("Fail to create hard link from {} to {}.", src.getAbsolutePath(), target.getAbsolutePath(), e);
				throw e;
			}
			// replace the file path in file meta
			fileMeta.filePath = target.getAbsolutePath();
		}
	}
}