/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.base.IntSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.MapSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.runtime.TupleSerializer;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataInputViewStreamWrapper;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContextImpl;
import org.apache.flink.runtime.state.gemini.engine.dbms.Supervisor;
import org.apache.flink.runtime.state.gemini.engine.dbms.SupervisorImpl;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.filter.CompositeStateFilter;
import org.apache.flink.runtime.state.gemini.engine.filter.RemoveAllStateFilter;
import org.apache.flink.runtime.state.gemini.engine.filter.TtlStateFilter;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileMeta;
import org.apache.flink.runtime.state.gemini.engine.metrics.CacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.ExceptionMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCleanerMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.GeminiMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.HandlerMetrics;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndex;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndexHashImpl;
import org.apache.flink.runtime.state.gemini.engine.snapshot.BackendSnapshotMeta;
import org.apache.flink.runtime.state.gemini.engine.snapshot.DBSnapshotMeta;
import org.apache.flink.runtime.state.gemini.engine.snapshot.DBSnapshotResult;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotCompletableFuture;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManager;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotMetaFile;
import org.apache.flink.runtime.state.gemini.time.ProcessingTimeProvider;
import org.apache.flink.runtime.state.gemini.time.TimeProvider;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FileUtils;
import org.apache.flink.util.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * GeminiDB is a multi-model data store designed to be used in Flink with a single-threaded access pattern.
 * It relies on all-in-memory compaction to avoid file compaction.
 * It supports snapshot/restore, compute-storage separation and TTL.
 * Only one instance with the same dbDFSPath may exist in a JVM.
 */
public class GeminiDB {

	private static final Logger LOG = LoggerFactory.getLogger(GeminiDB.class);

	/**
	 * DB status.
	 */
	public enum Status {
		INITIALIZE, OPENED, CLOSING, CLOSED, INTERNAL_ERROR
	}

	private String dbName;

	private Supervisor geminiSupervisor;

	private final Object lock = new Object();

	private volatile Status geminiDBStatus = Status.INITIALIZE;

	private volatile Throwable internalError;

	private final Map<String, GTable> geminiTableMap = new ConcurrentHashMap<>();

	private GContext gContext;

	private MetricGroup dbMetricGroup;

	private GConfiguration configuration;

	/**
	 * This constructor is only used for tests.
	 */
	@VisibleForTesting
	public GeminiDB() {
		this.geminiDBStatus = Status.OPENED;
	}

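	/**
	 * Creates a GeminiDB instance for the given region range. This sets up the context,
	 * time provider, metric groups, state filters and the supervisor, but does not start
	 * background services; {@link #open()} must be called before the DB is used.
	 */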
	public GeminiDB(
		String dbName, GConfiguration conf, int sRegionId, int eRegionId, MetricGroup metricGroup) {
		this.dbName = checkNotNull(dbName);
		this.configuration = checkNotNull(conf);
		this.gContext = new GContextImpl(this, sRegionId, eRegionId, conf);

		// TODO how to choose time provider according to TimeCharacteristic
		TimeProvider timeProvider = new ProcessingTimeProvider();
		this.gContext.setTimeProvider(timeProvider);

		// init metrics
		int sampleCount = conf.getMetricSampleCount();
		int histogramWindowSize = conf.getMetricHistogramWindowSize();
		this.dbMetricGroup = metricGroup;
		gContext.setDBMetricGroup(dbMetricGroup);

		MetricGroup fileManagerMetricGroup = dbMetricGroup.addGroup("fileManager");
		gContext.setFileManagerMetricGroup(fileManagerMetricGroup);

		GeminiMetrics geminiMetric = new GeminiMetrics(dbMetricGroup.addGroup("state"),
			sampleCount,
			histogramWindowSize);
		this.gContext.setGeminiMetric(geminiMetric);

		CacheMetrics cacheMetric = new CacheMetrics(dbMetricGroup.addGroup("cache"), sampleCount, histogramWindowSize);
		this.gContext.setCacheMetric(cacheMetric);

		HandlerMetrics handlerMetric = new HandlerMetrics(dbMetricGroup.addGroup("handler"),
			sampleCount,
			histogramWindowSize);
		this.gContext.setHandlerMetric(handlerMetric);

		FileCacheMetrics fileCacheMetrics = new FileCacheMetrics(dbMetricGroup.addGroup("fileCache"), sampleCount, histogramWindowSize);
		gContext.setFileCacheMetrics(fileCacheMetrics);

		FileCleanerMetrics fileCleanerMetrics = new FileCleanerMetrics(dbMetricGroup.addGroup("fileCleaner"), sampleCount, histogramWindowSize);
		gContext.setFileCleanerMetrics(fileCleanerMetrics);

		ExceptionMetrics exceptionMetrics = new ExceptionMetrics(dbMetricGroup.addGroup("exception"), sampleCount, histogramWindowSize);
		gContext.setExceptionMetrics(exceptionMetrics);

		//initialize
		CompositeStateFilter stateFilter = new CompositeStateFilter();
		if (gContext.hasTtl()) {
			stateFilter.addStateFilter(new TtlStateFilter());
		}
		stateFilter.addStateFilter(new RemoveAllStateFilter());
		gContext.setStateFilter(stateFilter);

		this.geminiSupervisor = new SupervisorImpl(gContext);
		this.gContext.setSupervisor(geminiSupervisor);

		handlerMetric.registerMetricsCacheStat(this.geminiSupervisor.getWriteBufferManager());

		LOG.info("GeminiDB is created.");
	}

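	/**
	 * Atomically transitions the DB status from {@code expected} to {@code target}.
	 *
	 * @param expected the status the DB must currently be in, or null to transition unconditionally
	 * @param target the status to transition to
	 * @return true if the transition was applied, false if the current status did not match
	 */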
	public boolean setStatus(Status expected, Status target) {
		synchronized (lock) {
			if (expected != null && geminiDBStatus != expected) {
				return false;
			}
			geminiDBStatus = target;
			return true;
		}
	}

	public Status getStatus() {
		return geminiDBStatus;
	}

	public Throwable getInternalError() {
		return internalError;
	}

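	/**
	 * Records an internal error. The error is only kept if the DB was in OPENED status,
	 * so only the first error after opening is remembered.
	 */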
	public void setInternalError(Throwable throwable) {
		if (setStatus(Status.OPENED, Status.INTERNAL_ERROR)) {
			internalError = throwable;
		}
	}

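	/**
	 * Opens the DB and starts the supervisor. The DB must be in INITIALIZE status,
	 * otherwise a {@link GeminiRuntimeException} is thrown.
	 */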
	public synchronized void open() {
		if (!setStatus(Status.INITIALIZE, Status.OPENED)) {
			throw new GeminiRuntimeException("open db failed, current status is " + geminiDBStatus.toString());
		}
		geminiSupervisor.start();
		LOG.info("GeminiDB is opened");
	}

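	/**
	 * Triggers a snapshot described by the given meta. The result can be retrieved
	 * asynchronously via {@link #getSnapshotResult(long)}.
	 */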
	public void startSnapshot(BackendSnapshotMeta backendSnapshotMeta) throws Exception {
		this.geminiSupervisor.startSnapshot(backendSnapshotMeta);
	}

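	/**
	 * Returns a future that completes with the {@link DBSnapshotResult} of the given checkpoint,
	 * or completes exceptionally if the snapshot fails or the DB is in an abnormal status.
	 * In all cases the pending snapshot is ended on the {@link SnapshotManager}.
	 */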
	public Future<DBSnapshotResult> getSnapshotResult(long checkpointId) {
		CompletableFuture<DBSnapshotResult> result = new CompletableFuture<>();
		SnapshotManager.PendingSnapshot pendingSnapshot = this.geminiSupervisor.getPendingSnapshot(checkpointId);
		SnapshotCompletableFuture snapshotCompletableFuture = pendingSnapshot.getResultFuture();

		snapshotCompletableFuture.whenCompleteAsync((Boolean sResult, Throwable throwable) -> {
			if (!gContext.isDBNormal()) {
				Throwable dbThrowable = new GeminiRuntimeException("DB is in abnormal status: " + geminiDBStatus.name());
				result.completeExceptionally(dbThrowable);
				gContext.getSupervisor().getSnapshotManager().endSnapshot(checkpointId, dbThrowable);
				return;
			}

			if (throwable != null) {
				result.completeExceptionally(throwable);
				gContext.getSupervisor().getSnapshotManager().endSnapshot(checkpointId, throwable);
				return;
			}

			Throwable snapshotThrowable = null;
			DBSnapshotResult snapshotResult = null;
			try {
				snapshotResult = pendingSnapshot.getSnapshotOperation().getSnapshotResult();
			} catch (Exception e) {
				snapshotThrowable = e;
			} finally {
				try {
					gContext.getSupervisor().getSnapshotManager().endSnapshot(checkpointId, snapshotThrowable);
				} catch (Exception e) {
					snapshotThrowable = ExceptionUtils.firstOrSuppressed(e, snapshotThrowable);
				}
				if (snapshotThrowable == null) {
					result.complete(snapshotResult);
				} else {
					result.completeExceptionally(snapshotThrowable);
				}
			}
		});
		return result;
	}

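	/**
	 * Restores the DB from the given snapshot metas: rebuilds the page indexes of the restored
	 * tables, restores the file mapping for the {@link SnapshotManager} and the DFS
	 * {@link FileManager}, and re-links local files if local snapshot meta is present.
	 * Rescaling is not supported yet, so exactly one snapshot meta is expected.
	 */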
	public void restoreFromSnapshot(
		List<DBSnapshotMeta> metas,
		Map<String, GTable> restoredTables,
		int startRegionId,
		int endRegionId) throws Exception {
		// TODO: #SR first implement restore for failover only.
		// 		- restore meta
		//		- restore index
		//		- restore data (table/region/writebuffer/pagestore)

		// TODO do not consider rescale currently
		Preconditions.checkArgument(metas.size() == 1, "rescale is not supported currently");
		MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMappingSerializer = getFileMappingSerializer();

		long restoredCheckpointID = 0;

		// dfs file mapping to restore for DB.
		int dfsFileMappingSize = 0;
		String restoredDfsBasePath = null;
		Map<Integer, String> dfsFileIdToPath = new HashMap<>();
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> dfsFileMapping = new HashMap<>();

		boolean hasLocalMeta = false;
		int localFileMappingSize = 0;
		String restoredLocalBasePath = null;
		Map<Integer, String> localFileIdToPath = null;
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> localFileMapping = null;

		// TODO #SR error handling.
		LOG.info("Start to restore from snapshot for GeminiDB, metas {}, tables {}, region from {} to {}.", metas, restoredTables, startRegionId, endRegionId);
		for (DBSnapshotMeta meta : metas) {
			restoredCheckpointID = meta.getCheckPointId();
			try (SnapshotMetaFile.Reader reader = SnapshotMetaFile.getReader(new Path(meta.getSnapshotMetaPath()))) {

				// restore copiedPageIndex
				int copiedPageIndexSize = reader.readInt();
				for (int i = 0; i < copiedPageIndexSize; ++i) {
					String tableName = reader.readUTF();

					int regionPageIndexesSize = reader.readInt();
					for (int j = 0; j < regionPageIndexesSize; ++j) {
						int regionCode = reader.readInt();
						GRegionID regionID = new GRegionIDImpl(regionCode);
						long lastSeqID = reader.readLong();
						long removeAllSeqID = reader.readLong();
						GTable table = restoredTables.get(tableName);
						GRegionContext regionContext = new GRegionContext(
							gContext, tableName, regionID, table.getTableDescription().getPageSerde(), lastSeqID, removeAllSeqID);
						PageIndex pageIndex = new PageIndexHashImpl.Builder(reader, regionContext).build();

						if (pageIndex != null) {
						// index ID 0 is the data region; only one index is supported for now. TODO support more indexes.
							if (regionID.getIndexID() == 0) {
								// data region.
								table.setRegion(regionID.getId(),
									table.getTableDescription().createRegion(gContext, table, regionID, pageIndex));
							} else {
								// index region
								table.setIndexRegion(regionID.getId(),
									table.getIndexDescription().createRegion(gContext, table, regionID, pageIndex));
							}
						}
					}
				}

				Preconditions.checkState(reader.readBoolean(), "dfs meta should always have file mapping");
				dfsFileMappingSize = reader.readInt();
				restoredDfsBasePath = reader.readUTF();
				if (dfsFileMappingSize > 0) {
					for (int i = 0; i < dfsFileMappingSize; ++i) {
						String filePath = reader.readUTF();
						Integer id = reader.readInt();
						dfsFileIdToPath.put(id, filePath);
					}
					DataInputView dataInputView = new DataInputViewStreamWrapper(reader);
					dfsFileMapping = fileMappingSerializer.deserialize(dataInputView);
				}

				hasLocalMeta = reader.readBoolean();
				if (hasLocalMeta) {
					localFileMappingSize = reader.readInt();
					restoredLocalBasePath = reader.readUTF();
					if (localFileMappingSize > 0) {
						localFileIdToPath = new HashMap<>();
						for (int i = 0; i < localFileMappingSize; ++i) {
							String filePath = reader.readUTF();
							Integer id = reader.readInt();
							localFileIdToPath.put(id, filePath);
						}
						DataInputView dataInputView = new DataInputViewStreamWrapper(reader);
						localFileMapping = fileMappingSerializer.deserialize(dataInputView);
					}
				}
			}
		}

		Preconditions.checkNotNull(restoredDfsBasePath);

		// TODO: #SR second fix for scale up/down.
		// - need to flush all data out in the first checkpoint after scale up/down.
		//1. key groups scale in or out
		//2. load region index from snapshot
		// startRegionId and endRegionId are the same as gContext's values.

		// TODO rescale is not considered currently, so this is always false.
		boolean needToBreakLineage = false;

		SnapshotManager snapshotManager = geminiSupervisor.getSnapshotManager();
		snapshotManager.setNeedToBreakLineage(needToBreakLineage);
		Map<Long, SnapshotManager.RestoredSnapshot> restoredSnapshots =
			snapshotManager.restore(restoredCheckpointID, dfsFileIdToPath, restoredDfsBasePath);

		FileManager dfsFileManager = gContext.getSupervisor().getDfsFileManager();
		Map<Integer, FileMeta.RestoredFileMeta> dbUsedFileMeta = dfsFileMappingSize == 0 ? new HashMap<>() :
			getRestoredFileMetaUsedByDB(dfsFileIdToPath, dfsFileMapping, !needToBreakLineage);
		dfsFileManager.restore(mergeDbAndSnapshotFileMeta(dbUsedFileMeta, restoredSnapshots, !needToBreakLineage));

		if (hasLocalMeta && localFileMappingSize > 0) {
			FileManager localFileManager = gContext.getSupervisor().getLocalFileManager();
			Map<Integer, String> newLocalFileIdToPath = restoreLocalFile(
				restoredLocalBasePath, localFileIdToPath, localFileManager.getBasePath());
			// currently local state uses hard links, and the restored files should always
			// be deleted by the current DB.
			localFileManager.restore(getRestoredFileMetaUsedByDB(newLocalFileIdToPath,
				localFileMapping,
				true));
		}

		LOG.info("Restored successfully from {} for {}, region {} to {}.", metas, restoredTables, startRegionId, endRegionId);
	}

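	/**
	 * Returns the table with the given description, creating it and registering it with the
	 * cache manager and write buffer manager if it does not exist yet.
	 */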
	public GTable getTableOrCreate(GTableDescription tableDescription) throws GeminiRuntimeException {
		return geminiTableMap.computeIfAbsent(tableDescription.getTableName(), (nothing) -> {
			GTable gTable = tableDescription.createTable(gContext);
			this.geminiSupervisor.getCacheManager().addTable(gTable);
			this.geminiSupervisor.getWriteBufferManager().addTableNum(tableDescription.getTableName());
			return gTable;
		});

	}

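	/**
	 * Closes the DB: marks the status as CLOSED and shuts down the supervisor.
	 */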
	public void close() {
		setStatus(null, Status.CLOSED);
		this.geminiSupervisor.close();
		LOG.info("GeminiDB is closed");
	}

	public GContext getGContext() {
		return gContext;
	}

	public GConfiguration getConfiguration() {
		return configuration;
	}

	public Map<String, GTable> getGeminiTableMap() {
		return geminiTableMap;
	}

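	/**
	 * Builds the serializer for the file mapping: file id -> (group -> (reference count, data size)).
	 */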
	@SuppressWarnings("unchecked")
	private MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> getFileMappingSerializer() {
		TupleSerializer<Tuple2<Integer, Long>> tuple2Serializer = new TupleSerializer<>(
			(Class<Tuple2<Integer, Long>>) (Class<?>) Tuple2.class,
			new TypeSerializer[]{IntSerializer.INSTANCE, LongSerializer.INSTANCE}
		);
		MapSerializer<Integer, Tuple2<Integer, Long>> groupMapSerializer = new MapSerializer<>(
			IntSerializer.INSTANCE, tuple2Serializer);
		return new MapSerializer<>(IntSerializer.INSTANCE, groupMapSerializer);
	}

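	/**
	 * Converts the restored file mapping into {@link FileMeta.RestoredFileMeta}, accumulating the
	 * DB reference count and data size over all groups of each file.
	 */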
	private Map<Integer, FileMeta.RestoredFileMeta> getRestoredFileMetaUsedByDB(
		Map<Integer, String> fileIDToPath,
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMapping,
		boolean canDeleteFile) {
		Map<Integer, FileMeta.RestoredFileMeta> restoredFileMetas = new HashMap<>(fileIDToPath.size());
		for (Map.Entry<Integer, Map<Integer, Tuple2<Integer, Long>>> entry : fileMapping.entrySet()) {
			int id = entry.getKey();
			String filePath = fileIDToPath.get(id);
			long dataSize = 0;
			int dbReference = 0;
			Map<Integer, Tuple2<Integer, Long>> groupInfo = entry.getValue();
			for (Map.Entry<Integer, Tuple2<Integer, Long>> e : groupInfo.entrySet()) {
				dbReference += e.getValue().f0;
				dataSize += e.getValue().f1;
			}
			long fileSize = 0;
			// TODO should we get file size here.
			FileMeta.RestoredFileMeta meta = FileMeta.RestoredFileMeta.of(
					id,
					filePath,
					fileSize,
					dataSize,
					dbReference,
					0,
					canDeleteFile);
			restoredFileMetas.put(id, meta);
		}
		return restoredFileMetas;
	}

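	/**
	 * Merges the file metas referenced by the DB with the files referenced by the restored
	 * snapshots, incrementing the snapshot reference count for files used by snapshots.
	 */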
	private Map<Integer, FileMeta.RestoredFileMeta> mergeDbAndSnapshotFileMeta(
		Map<Integer, FileMeta.RestoredFileMeta> dbUsedFileMeta,
		Map<Long, SnapshotManager.RestoredSnapshot> restoredSnapshots,
		boolean canDeleteFile) {
		Map<Integer, FileMeta.RestoredFileMeta> mergedFileMetas = new HashMap<>(dbUsedFileMeta);
		for (SnapshotManager.RestoredSnapshot restoredSnapshot : restoredSnapshots.values()) {
			Map<Integer, String> fileMapping = restoredSnapshot.getFileMapping();
			for (Map.Entry<Integer, String> entry : fileMapping.entrySet()) {
				int fileId = entry.getKey();
				String path = entry.getValue();
				FileMeta.RestoredFileMeta fileMeta = mergedFileMetas.get(fileId);
				if (fileMeta != null) {
					fileMeta.snapshotReference += 1;
				} else {
					fileMeta = FileMeta.RestoredFileMeta.of(
						fileId, path, 0, 0, 0, 1, canDeleteFile);
					mergedFileMetas.put(fileId, fileMeta);
				}
			}
		}
		return mergedFileMetas;
	}

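	/**
	 * Recreates the local working directory and hard-links the restored local files into it.
	 *
	 * @return the mapping from file id to the new local file path
	 */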
	private Map<Integer, String> restoreLocalFile(
		String restoredLocalBasePath,
		Map<Integer, String> localFileIdToPath,
		Path workingBasePath) throws Exception {
		Map<Integer, String> newLocalFileIdToPath = new HashMap<>();
		File restoredLocalBaseDir = new File(new Path(restoredLocalBasePath).toUri().getPath());
		File workingBaseDir = new File(workingBasePath.toUri().getPath());
		if (workingBaseDir.exists()) {
			FileUtils.deleteDirectory(workingBaseDir);
		}
		if (!workingBaseDir.mkdirs()) {
			throw new IOException("Local working directory for  already exists: " + workingBaseDir);
		}
		for (Map.Entry<Integer, String> entry : localFileIdToPath.entrySet()) {
			int fileId = entry.getKey();
			String fileName = entry.getValue();
			File src = new File(restoredLocalBaseDir, fileName);
			File target = new File(workingBaseDir, fileName);
			try {
				Files.createLink(target.toPath(), src.toPath());
			} catch (Exception e) {
				LOG.error("Fail to create hard link from {} to {}, {}", src.getAbsolutePath(), target.getAbsolutePath(), e);
				throw e;
			}
			newLocalFileIdToPath.put(fileId, target.getAbsolutePath());
		}
		return newLocalFileIdToPath;
	}
}
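
/*
 * A minimal usage sketch. It only uses the public API defined above; the conf, metricGroup,
 * tableDescription, backendSnapshotMeta and checkpointId values are assumed to be supplied by
 * the surrounding Flink state backend.
 *
 *   GeminiDB db = new GeminiDB("db-1", conf, startRegionId, endRegionId, metricGroup);
 *   db.open();
 *   GTable table = db.getTableOrCreate(tableDescription);
 *   // ... access state through the table ...
 *   db.startSnapshot(backendSnapshotMeta);
 *   Future<DBSnapshotResult> snapshotResult = db.getSnapshotResult(checkpointId);
 *   db.close();
 */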