/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.state.gemini.engine;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.base.IntSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.MapSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.runtime.TupleSerializer;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataInputViewStreamWrapper;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContextImpl;
import org.apache.flink.runtime.state.gemini.engine.dbms.Supervisor;
import org.apache.flink.runtime.state.gemini.engine.dbms.SupervisorImpl;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.filter.CompositeStateFilter;
import org.apache.flink.runtime.state.gemini.engine.filter.RemoveAllStateFilter;
import org.apache.flink.runtime.state.gemini.engine.filter.TtlStateFilter;
import org.apache.flink.runtime.state.gemini.engine.fs.FileManager;
import org.apache.flink.runtime.state.gemini.engine.fs.FileMeta;
import org.apache.flink.runtime.state.gemini.engine.metrics.CacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.ExceptionMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCacheMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.FileCleanerMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.GeminiMetrics;
import org.apache.flink.runtime.state.gemini.engine.metrics.HandlerMetrics;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndex;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndexHashImpl;
import org.apache.flink.runtime.state.gemini.engine.snapshot.BackendSnapshotMeta;
import org.apache.flink.runtime.state.gemini.engine.snapshot.DBSnapshotMeta;
import org.apache.flink.runtime.state.gemini.engine.snapshot.DBSnapshotResult;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotCompletableFuture;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManager;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotMetaFile;
import org.apache.flink.runtime.state.gemini.time.ProcessingTimeProvider;
import org.apache.flink.runtime.state.gemini.time.TimeProvider;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FileUtils;
import org.apache.flink.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
 * GeminiDB is a multi-model data store designed for use in Flink with a single-threaded
 * access pattern. It relies on all-in-memory compaction to avoid file compaction, and it
 * supports snapshot/restore, compute-storage separation, and TTL. Only one instance with
 * the same dbDFSPath may exist in a JVM.
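 *
 * <p>A minimal usage sketch (illustrative only; {@code conf}, {@code metricGroup},
 * {@code tableDescription} and the region ids are assumed to be supplied by the caller):
 * <pre>{@code
 * GeminiDB db = new GeminiDB("db", conf, startRegionId, endRegionId, metricGroup);
 * db.open();
 * GTable table = db.getTableOrCreate(tableDescription);
 * // ... read and write state through the table ...
 * db.close();
 * }</pre>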
*/
public class GeminiDB {
private static final Logger LOG = LoggerFactory.getLogger(GeminiDB.class);
/**
* DB status.
*/
public enum Status {
INITIALIZE, OPENED, CLOSING, CLOSED, INTERNAL_ERROR
}
private String dbName;
private Supervisor geminiSupervisor;
private final Object lock = new Object();
private volatile Status geminiDBStatus = Status.INITIALIZE;
private volatile Throwable internalError;
	private final Map<String, GTable> geminiTableMap = new ConcurrentHashMap<>();
private GContext gContext;
private MetricGroup dbMetricGroup;
private GConfiguration configuration;
/**
* This constructor is only used for tests.
*/
@VisibleForTesting
public GeminiDB() {
this.geminiDBStatus = Status.OPENED;
}
public GeminiDB(
String dbName, GConfiguration conf, int sRegionId, int eRegionId, MetricGroup metricGroup) {
this.dbName = checkNotNull(dbName);
this.configuration = checkNotNull(conf);
this.gContext = new GContextImpl(this, sRegionId, eRegionId, conf);
// TODO how to choose time provider according to TimeCharacteristic
TimeProvider timeProvider = new ProcessingTimeProvider();
this.gContext.setTimeProvider(timeProvider);
// init metrics
int sampleCount = conf.getMetricSampleCount();
int histogramWindowSize = conf.getMetricHistogramWindowSize();
this.dbMetricGroup = metricGroup;
gContext.setDBMetricGroup(dbMetricGroup);
MetricGroup fileManagerMetricGroup = dbMetricGroup.addGroup("fileManager");
gContext.setFileManagerMetricGroup(fileManagerMetricGroup);
GeminiMetrics geminiMetric = new GeminiMetrics(dbMetricGroup.addGroup("state"),
sampleCount,
histogramWindowSize);
this.gContext.setGeminiMetric(geminiMetric);
CacheMetrics cacheMetric = new CacheMetrics(dbMetricGroup.addGroup("cache"), sampleCount, histogramWindowSize);
this.gContext.setCacheMetric(cacheMetric);
HandlerMetrics handlerMetric = new HandlerMetrics(dbMetricGroup.addGroup("handler"),
sampleCount,
histogramWindowSize);
this.gContext.setHandlerMetric(handlerMetric);
FileCacheMetrics fileCacheMetrics = new FileCacheMetrics(dbMetricGroup.addGroup("fileCache"), sampleCount, histogramWindowSize);
gContext.setFileCacheMetrics(fileCacheMetrics);
FileCleanerMetrics fileCleanerMetrics = new FileCleanerMetrics(dbMetricGroup.addGroup("fileCleaner"), sampleCount, histogramWindowSize);
gContext.setFileCleanerMetrics(fileCleanerMetrics);
ExceptionMetrics exceptionMetrics = new ExceptionMetrics(dbMetricGroup.addGroup("exception"), sampleCount, histogramWindowSize);
gContext.setExceptionMetrics(exceptionMetrics);
		// initialize state filters
CompositeStateFilter stateFilter = new CompositeStateFilter();
if (gContext.hasTtl()) {
stateFilter.addStateFilter(new TtlStateFilter());
}
stateFilter.addStateFilter(new RemoveAllStateFilter());
gContext.setStateFilter(stateFilter);
this.geminiSupervisor = new SupervisorImpl(gContext);
this.gContext.setSupervisor(geminiSupervisor);
handlerMetric.registerMetricsCacheStat(this.geminiSupervisor.getWriteBufferManager());
LOG.info("GeminiDB is created.");
}
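	/**
	 * Atomically transitions the DB status from {@code expected} to {@code target}.
	 * A null {@code expected} status forces the transition unconditionally.
	 */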
public boolean setStatus(Status expected, Status target) {
synchronized (lock) {
if (expected != null && geminiDBStatus != expected) {
return false;
}
geminiDBStatus = target;
return true;
}
}
public Status getStatus() {
return geminiDBStatus;
}
public Throwable getInternalError() {
return internalError;
}
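	/**
	 * Records an internal error and moves the DB from OPENED to INTERNAL_ERROR.
	 * Errors reported while the DB is in any other status are ignored.
	 */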
public void setInternalError(Throwable throwable) {
if (setStatus(Status.OPENED, Status.INTERNAL_ERROR)) {
internalError = throwable;
}
}
public synchronized void open() {
if (!setStatus(Status.INITIALIZE, Status.OPENED)) {
throw new GeminiRuntimeException("open db failed, current status is " + geminiDBStatus.toString());
}
geminiSupervisor.start();
LOG.info("GeminiDB is opened");
}
public void startSnapshot(BackendSnapshotMeta backendSnapshotMeta) throws Exception {
this.geminiSupervisor.startSnapshot(backendSnapshotMeta);
}
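	/**
	 * Returns a future for the result of the snapshot identified by the given checkpoint id.
	 * The future completes with the {@link DBSnapshotResult} once the pending snapshot
	 * finishes, and completes exceptionally if the DB is in an abnormal status or the
	 * snapshot operation fails. In all cases {@code endSnapshot} is called on the
	 * {@link SnapshotManager} so the pending snapshot is released.
	 */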
	public Future<DBSnapshotResult> getSnapshotResult(long checkpointId) {
		CompletableFuture<DBSnapshotResult> result = new CompletableFuture<>();
SnapshotManager.PendingSnapshot pendingSnapshot = this.geminiSupervisor.getPendingSnapshot(checkpointId);
SnapshotCompletableFuture snapshotCompletableFuture = pendingSnapshot.getResultFuture();
snapshotCompletableFuture.whenCompleteAsync((Boolean sResult, Throwable throwable) -> {
if (!gContext.isDBNormal()) {
Throwable dbThrowable = new GeminiRuntimeException("DB is in abnormal status: " + geminiDBStatus.name());
result.completeExceptionally(dbThrowable);
gContext.getSupervisor().getSnapshotManager().endSnapshot(checkpointId, dbThrowable);
return;
}
if (throwable != null) {
result.completeExceptionally(throwable);
gContext.getSupervisor().getSnapshotManager().endSnapshot(checkpointId, throwable);
return;
}
Throwable snapshotThrowable = null;
DBSnapshotResult snapshotResult = null;
try {
snapshotResult = pendingSnapshot.getSnapshotOperation().getSnapshotResult();
} catch (Exception e) {
snapshotThrowable = e;
} finally {
try {
gContext.getSupervisor().getSnapshotManager().endSnapshot(checkpointId, snapshotThrowable);
} catch (Exception e) {
snapshotThrowable = ExceptionUtils.firstOrSuppressed(e, snapshotThrowable);
}
if (snapshotThrowable == null) {
result.complete(snapshotResult);
} else {
result.completeExceptionally(snapshotThrowable);
}
}
});
return result;
}
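	/**
	 * Restores the DB from the given snapshot metas. The meta file is consumed in the
	 * order it appears on disk: the copied page indexes per table and region first, then
	 * the DFS file mapping (file id to path plus the serialized per-group reference/size
	 * mapping), and finally an optional local file mapping whose files are re-linked into
	 * the local working directory.
	 */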
public void restoreFromSnapshot(
			List<DBSnapshotMeta> metas,
			Map<String, GTable> restoredTables,
int startRegionId,
int endRegionId) throws Exception {
// TODO: #SR first implement restore for failover only.
// - restore meta
// - restore index
// - restore data (table/region/writebuffer/pagestore)
// TODO do not consider rescale currently
		Preconditions.checkArgument(metas.size() == 1, "rescale is not considered currently");
		MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMappingSerializer = getFileMappingSerializer();
long restoredCheckpointID = 0;
// dfs file mapping to restore for DB.
int dfsFileMappingSize = 0;
String restoredDfsBasePath = null;
		Map<Integer, String> dfsFileIdToPath = new HashMap<>();
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> dfsFileMapping = new HashMap<>();
boolean hasLocalMeta = false;
int localFileMappingSize = 0;
String restoredLocalBasePath = null;
		Map<Integer, String> localFileIdToPath = null;
		Map<Integer, Map<Integer, Tuple2<Integer, Long>>> localFileMapping = null;
// TODO #SR error handle.
LOG.info("Start to restore from snapshot for GeminiDB, metas {}, tables {}, region from {} to {}.", metas, restoredTables, startRegionId, endRegionId);
for (DBSnapshotMeta meta : metas) {
restoredCheckpointID = meta.getCheckPointId();
try (SnapshotMetaFile.Reader reader = SnapshotMetaFile.getReader(new Path(meta.getSnapshotMetaPath()))) {
// restore copiedPageIndex
int copiedPageIndexSize = reader.readInt();
for (int i = 0; i < copiedPageIndexSize; ++i) {
String tableName = reader.readUTF();
int regionPageIndexesSize = reader.readInt();
for (int j = 0; j < regionPageIndexesSize; ++j) {
int regionCode = reader.readInt();
GRegionID regionID = new GRegionIDImpl(regionCode);
long lastSeqID = reader.readLong();
long removeAllSeqID = reader.readLong();
GTable table = restoredTables.get(tableName);
GRegionContext regionContext = new GRegionContext(
gContext, tableName, regionID, table.getTableDescription().getPageSerde(), lastSeqID, removeAllSeqID);
PageIndex pageIndex = new PageIndexHashImpl.Builder(reader, regionContext).build();
if (pageIndex != null) {
						// Index ID 0 is data; only one index is supported now. TODO support more indexes.
if (regionID.getIndexID() == 0) {
// data region.
table.setRegion(regionID.getId(),
table.getTableDescription().createRegion(gContext, table, regionID, pageIndex));
} else {
// index region
table.setIndexRegion(regionID.getId(),
table.getIndexDescription().createRegion(gContext, table, regionID, pageIndex));
}
}
}
}
			Preconditions.checkState(reader.readBoolean(), "dfs meta should always have file mapping");
dfsFileMappingSize = reader.readInt();
restoredDfsBasePath = reader.readUTF();
if (dfsFileMappingSize > 0) {
for (int i = 0; i < dfsFileMappingSize; ++i) {
String filePath = reader.readUTF();
Integer id = reader.readInt();
dfsFileIdToPath.put(id, filePath);
}
DataInputView dataInputView = new DataInputViewStreamWrapper(reader);
dfsFileMapping = fileMappingSerializer.deserialize(dataInputView);
}
hasLocalMeta = reader.readBoolean();
if (hasLocalMeta) {
localFileMappingSize = reader.readInt();
restoredLocalBasePath = reader.readUTF();
if (localFileMappingSize > 0) {
localFileIdToPath = new HashMap<>();
for (int i = 0; i < localFileMappingSize; ++i) {
String filePath = reader.readUTF();
Integer id = reader.readInt();
localFileIdToPath.put(id, filePath);
}
DataInputView dataInputView = new DataInputViewStreamWrapper(reader);
localFileMapping = fileMappingSerializer.deserialize(dataInputView);
}
}
}
}
Preconditions.checkNotNull(restoredDfsBasePath);
		// TODO: #SR second, fix for scale up/down:
		// - need to flush all data out in the first checkpoint after scale up/down.
		// 1. key groups scale in or out
		// 2. load region index from snapshot
		// startRegionId and endRegionId are the same as gContext's values.
		// TODO rescale is not considered currently, so this is always false.
		boolean needToBreakLineage = false;
SnapshotManager snapshotManager = geminiSupervisor.getSnapshotManager();
snapshotManager.setNeedToBreakLineage(needToBreakLineage);
		Map<Long, SnapshotManager.RestoredSnapshot> restoredSnapshots =
snapshotManager.restore(restoredCheckpointID, dfsFileIdToPath, restoredDfsBasePath);
FileManager dfsFileManager = gContext.getSupervisor().getDfsFileManager();
		Map<Integer, FileMeta.RestoredFileMeta> dbUsedFileMeta = dfsFileMappingSize == 0 ? new HashMap<>() :
getRestoredFileMetaUsedByDB(dfsFileIdToPath, dfsFileMapping, !needToBreakLineage);
dfsFileManager.restore(mergeDbAndSnapshotFileMeta(dbUsedFileMeta, restoredSnapshots, !needToBreakLineage));
if (hasLocalMeta && localFileMappingSize > 0) {
FileManager localFileManager = gContext.getSupervisor().getLocalFileManager();
			Map<Integer, String> newLocalFileIdToPath = restoreLocalFile(
restoredLocalBasePath, localFileIdToPath, localFileManager.getBasePath());
// currently local state use hard link, and the restored files should always
// be deleted by the current DB.
localFileManager.restore(getRestoredFileMetaUsedByDB(newLocalFileIdToPath,
localFileMapping,
true));
}
LOG.info("Restored successfully from {} for {}, region {} to {}.", metas, restoredTables, startRegionId, endRegionId);
}
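	/**
	 * Returns the table with the given name, creating it and registering it with the
	 * cache manager and the write buffer manager on first access.
	 */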
public GTable getTableOrCreate(GTableDescription tableDescription) throws GeminiRuntimeException {
return geminiTableMap.computeIfAbsent(tableDescription.getTableName(), (nothing) -> {
GTable gTable = tableDescription.createTable(gContext);
this.geminiSupervisor.getCacheManager().addTable(gTable);
this.geminiSupervisor.getWriteBufferManager().addTableNum(tableDescription.getTableName());
return gTable;
});
}
public void close() {
setStatus(null, Status.CLOSED);
this.geminiSupervisor.close();
LOG.info("GeminiDB is closed");
}
public GContext getGContext() {
return gContext;
}
public GConfiguration getConfiguration() {
return configuration;
}
	public Map<String, GTable> getGeminiTableMap() {
return geminiTableMap;
}
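	/**
	 * Builds the serializer for the persisted file mapping. Judging by how the mapping is
	 * consumed in {@link #getRestoredFileMetaUsedByDB}, the structure is
	 * fileId -> (groupId -> (referenceCount, dataSize)).
	 */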
@SuppressWarnings("unchecked")
	private MapSerializer<Integer, Map<Integer, Tuple2<Integer, Long>>> getFileMappingSerializer() {
		TupleSerializer<Tuple2<Integer, Long>> tuple2Serializer = new TupleSerializer<>(
			(Class<Tuple2<Integer, Long>>) (Class<?>) Tuple2.class,
			new TypeSerializer[]{IntSerializer.INSTANCE, LongSerializer.INSTANCE}
		);
		MapSerializer<Integer, Tuple2<Integer, Long>> groupMapSerializer = new MapSerializer<>(
			IntSerializer.INSTANCE, tuple2Serializer);
		return new MapSerializer<>(IntSerializer.INSTANCE, groupMapSerializer);
}
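	/**
	 * Converts the restored file mapping into {@link FileMeta.RestoredFileMeta} entries,
	 * aggregating the per-group reference counts (f0) and data sizes (f1) of each file.
	 */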
	private Map<Integer, FileMeta.RestoredFileMeta> getRestoredFileMetaUsedByDB(
			Map<Integer, String> fileIDToPath,
			Map<Integer, Map<Integer, Tuple2<Integer, Long>>> fileMapping,
			boolean canDeleteFile) {
		Map<Integer, FileMeta.RestoredFileMeta> restoredFileMetas = new HashMap<>(fileIDToPath.size());
		for (Map.Entry<Integer, Map<Integer, Tuple2<Integer, Long>>> entry : fileMapping.entrySet()) {
int id = entry.getKey();
String filePath = fileIDToPath.get(id);
long dataSize = 0;
int dbReference = 0;
			Map<Integer, Tuple2<Integer, Long>> groupInfo = entry.getValue();
			for (Map.Entry<Integer, Tuple2<Integer, Long>> e : groupInfo.entrySet()) {
dbReference += e.getValue().f0;
dataSize += e.getValue().f1;
}
long fileSize = 0;
// TODO should we get file size here.
FileMeta.RestoredFileMeta meta = FileMeta.RestoredFileMeta.of(
id,
filePath,
fileSize,
dataSize,
dbReference,
0,
canDeleteFile);
restoredFileMetas.put(id, meta);
}
return restoredFileMetas;
}
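	/**
	 * Merges the file metas referenced by the DB with those referenced by the restored
	 * snapshots, incrementing the snapshot reference count of files that appear in both.
	 */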
	private Map<Integer, FileMeta.RestoredFileMeta> mergeDbAndSnapshotFileMeta(
			Map<Integer, FileMeta.RestoredFileMeta> dbUsedFileMeta,
			Map<Long, SnapshotManager.RestoredSnapshot> restoredSnapshots,
boolean canDeleteFile) {
		Map<Integer, FileMeta.RestoredFileMeta> mergedFileMetas = new HashMap<>(dbUsedFileMeta);
for (SnapshotManager.RestoredSnapshot restoredSnapshot : restoredSnapshots.values()) {
			Map<Integer, String> fileMapping = restoredSnapshot.getFileMapping();
			for (Map.Entry<Integer, String> entry : fileMapping.entrySet()) {
int fileId = entry.getKey();
String path = entry.getValue();
FileMeta.RestoredFileMeta fileMeta = mergedFileMetas.get(fileId);
if (fileMeta != null) {
fileMeta.snapshotReference += 1;
} else {
fileMeta = FileMeta.RestoredFileMeta.of(
fileId, path, 0, 0, 0, 1, canDeleteFile);
mergedFileMetas.put(fileId, fileMeta);
}
}
}
return mergedFileMetas;
}
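	/**
	 * Re-creates the restored local files under the working directory via hard links and
	 * returns the mapping from file id to the new absolute path.
	 */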
	private Map<Integer, String> restoreLocalFile(
			String restoredLocalBasePath,
			Map<Integer, String> localFileIdToPath,
			Path workingBasePath) throws Exception {
		Map<Integer, String> newLocalFileIdToPath = new HashMap<>();
File restoredLocalBaseDir = new File(new Path(restoredLocalBasePath).toUri().getPath());
File workingBaseDir = new File(workingBasePath.toUri().getPath());
if (workingBaseDir.exists()) {
FileUtils.deleteDirectory(workingBaseDir);
}
if (!workingBaseDir.mkdirs()) {
throw new IOException("Local working directory for already exists: " + workingBaseDir);
}
		for (Map.Entry<Integer, String> entry : localFileIdToPath.entrySet()) {
int fileId = entry.getKey();
String fileName = entry.getValue();
File src = new File(restoredLocalBaseDir, fileName);
File target = new File(workingBaseDir, fileName);
try {
Files.createLink(target.toPath(), src.toPath());
} catch (Exception e) {
LOG.error("Fail to create hard link from {} to {}, {}", src.getAbsolutePath(), target.getAbsolutePath(), e);
throw e;
}
newLocalFileIdToPath.put(fileId, target.getAbsolutePath());
}
return newLocalFileIdToPath;
}
}