org.apache.flink.runtime.state.gemini.engine.memstore.AbstractWriteBuffer
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine.memstore;

import org.apache.flink.runtime.state.gemini.engine.GRegion;
import org.apache.flink.runtime.state.gemini.engine.GRegionContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiShutDownException;
import org.apache.flink.runtime.state.gemini.engine.filecache.FileCache;
import org.apache.flink.runtime.state.gemini.engine.filecache.PageBatchFlusher;
import org.apache.flink.runtime.state.gemini.engine.handler.PageHandler;
import org.apache.flink.runtime.state.gemini.engine.page.DataPage;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddress;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndex;
import org.apache.flink.runtime.state.gemini.engine.page.PageStore;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotCompletableFuture;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManager;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotOperation;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotStat;
import org.apache.flink.util.Preconditions;

import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutor;
import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutorGroup;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

/**
 * Base implementation of {@link WriteBuffer} that manages the active segment, flushes
 * full segments into pages, and snapshots the buffered state.
 */
public abstract class AbstractWriteBuffer implements WriteBuffer {
	private static final Logger LOG = LoggerFactory.getLogger(AbstractWriteBuffer.class);
	protected final EventExecutor eventExecutor;
	protected final GRegionContext gRegionContext;
	protected long segmentID = 0;
	protected final GRegion gRegion;
	protected final PageStore pageStore;
	private final WriteBufferManager writeBufferManager;
	private long printTS = System.currentTimeMillis();
	private CompletableFuture<Void> lastFuture;

	public AbstractWriteBuffer(
		GRegion gRegion, EventExecutor eventExecutor, PageStore pageStore) {
		this.gRegionContext = gRegion.getGRegionContext();
		this.gRegion = gRegion;
		this.eventExecutor = eventExecutor;
		this.pageStore = pageStore;
		this.writeBufferManager = gRegionContext.getGContext().getSupervisor().getWriteBufferManager();
	}

	@Override
	public EventExecutor getExecutor() {
		return this.eventExecutor;
	}

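	/**
	 * Invoked on the write path: decides whether the active segment should be cut off and
	 * flushed, and blocks the writer while too many segments are already flushing.
	 */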
	void checkResource() {
		// 1. Check whether a snapshot has happened.
		// 2. Check WriteBufferStats to decide whether to cut off the active segment.
		// 3. Ask the WriteBufferManager whether this write buffer must block.
		// No size estimate is available yet at this point.
		long startTime = System.currentTimeMillis();
		if (printTS + 60000 < startTime) {
			printTS = startTime;
			if (LOG.isDebugEnabled()) {
				LOG.debug("writeBufferStats: {}", gRegionContext.getWriteBufferStats());
				LOG.debug("pageStoreStats: {}", gRegionContext.getPageStoreStats());
			}
		}
		if (gRegionContext.getWriteBufferStats().getAverageKeyLen() < 0) {
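			// averageKeyLen sentinels: -2 = estimation not started yet, -1 = estimation in progress.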
			if (gRegionContext.getWriteBufferStats().getAverageKeyLen() == -2) {
				// Launch an estimation pass to compute the average key/value lengths.
				lastFuture = new CompletableFuture<>();
				gRegionContext.getWriteBufferStats().setAverageKeyLen(-1);
				final Segment segmentCopy = getActiveSegment().copySegment();
				eventExecutor.execute(() -> {
					createPageHandler(segmentCopy, true).handle();
					lastFuture.complete(null);
				});
			}

			if (getActiveSegment().getRecordCount() > 1000) {
				if (!lastFuture.isDone()) {
					try {
						lastFuture.get(10, TimeUnit.MILLISECONDS);
					} catch (Exception e) {
						return;
					}
				}
			} else {
				return;
			}
		}

		// Estimate MapValue and List sizes based on their elements.
		int writeBufferEstimatedSize = getEstimatedSize(getActiveSegment().getRecordCount());
		int totalWriteBufferEstimatedSize = writeBufferEstimatedSize + getEstimatedSize(writeBufferManager.getTotalRecordCount());

		if (writeBufferEstimatedSize < gRegionContext.getWriteBufferWaterMark()) {
			if (totalWriteBufferEstimatedSize < writeBufferManager.getTotalMemSize() || !writeBufferManager.isBestChoiceWriteBufferFlushing(
				this)) {
				return;
			}
		}

		GContext gContext = gRegionContext.getGContext();
		while (gRegionContext.getWriteBufferStats().getFlushingSegmentCount() >= gContext.getGConfiguration().getNumFlushingSegment()) {
			gContext.checkDBStatus();
			if (writeBufferManager.canFlushWriteBuffer(this)) {
				break;
			}
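			// Wait briefly and re-check; endSegmentFlush() calls notify() when a segment finishes.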
			synchronized (this) {
				try {
					wait(1);
					// Track how often flushing blocks; if it blocks too often, tune this wait.
					writeBufferManager.increaseWriteBufferFlushBlock();
				} catch (InterruptedException e) {
					throw new GeminiRuntimeException(e);
				}
			}
		}

		long waitTime = System.currentTimeMillis() - startTime;
		if (waitTime > 10) {
			LOG.info("too much flushing segment or evict too long, wait time ={} ...", waitTime);
		}

		Segment rs = addFlushingSegment();
		gRegionContext.getWriteBufferStats().addTotalFlushingRecordCount(rs.getRecordCount());
		gRegionContext.getWriteBufferStats().addTotalRecordCount(-rs.getRecordCount());
		doSegmentFlush(rs, writeBufferEstimatedSize);
	}

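	/**
	 * Asynchronously converts the given segment into pages on the event executor and removes
	 * it from the flushing queue when done; any failure other than a shutdown marks the DB
	 * as internally errored.
	 */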
	void doSegmentFlush(Segment segment, int estimatedSize) {
		eventExecutor.execute(() -> {
			try {
				PageHandler pageHandler = createPageHandler(segment, false);
				pageHandler.handle();
				// TODO to handle exceptions, add a mechanism to re-flush this segment.
				endSegmentFlush(segment.getSegmentID());
			} catch (GeminiShutDownException ignore) {
				LOG.debug("gemini has shutdown", ignore);
			} catch (Exception e) {
				LOG.error("Internal Bug. Flush segment failed", e);
				gRegionContext.getGContext().setDBInternalError(e);
			}
		});
	}

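	/**
	 * Removes the finished segment from the flushing queue, updates the flushing statistics,
	 * and wakes up writers blocked in {@link #checkResource()}.
	 */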
	private void endSegmentFlush(long segmentID) {
		Segment segment = pollFlushingSegment();
		Preconditions.checkArgument(segment != null, "error segment!");
		Preconditions.checkArgument(segment.getSegmentID() == segmentID, "error segment!");
		gRegionContext.getWriteBufferStats().addFlushingSegmentCount(-1);
		gRegionContext.getWriteBufferStats().addTotalFlushingRecordCount(-segment.getRecordCount());
		synchronized (this) {
			notify();
		}
	}

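	/** Returns the segment that currently accepts writes. */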
	public abstract Segment getActiveSegment();

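	/** Cuts off the active segment, appends it to the flushing queue, and returns it. */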
	abstract Segment addFlushingSegment();

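	/** Removes and returns the next segment from the flushing queue. */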
	abstract Segment pollFlushingSegment();

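	/**
	 * Creates the handler that turns the segment into pages; with {@code onlyEstimatedSize}
	 * set, it only estimates the average key/value lengths instead of producing pages.
	 */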
	abstract PageHandler createPageHandler(Segment segment, boolean onlyEstimatedSize);

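	/**
	 * Estimates the in-memory size of {@code elementSize} records from the average key/value
	 * lengths and the page size rate; returns 0 while no estimate is available yet.
	 */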
	private int getEstimatedSize(long elementSize) {
		if (gRegionContext.getWriteBufferStats().getAverageKeyLen() <= 0) {
			LOG.error("Let's see whether it will happen!");
			return 0;
		}
		return (int) (gRegionContext.getPageStoreStats().getPageSizeRate()
			* (gRegionContext.getWriteBufferStats().getAverageKeyLen()
				+ gRegionContext.getWriteBufferStats().getAverageValueLen())
			* elementSize);
	}

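	/**
	 * Flushes the write buffer for the given snapshot: cuts off the active segment, turns it
	 * into pages, copies the page index into the snapshot meta, and then flushes every page
	 * to DFS (and, if enabled, to the local file cache) on the snapshot executor.
	 */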
	@Override
	public void doSnapshot(SnapshotOperation snapshotOperation) {

		SnapshotManager.PendingSnapshot pendingSnapshot = snapshotOperation.getPendingSnapshot();
		long checkpointId = pendingSnapshot.getCheckpointId();
		SnapshotCompletableFuture snapshotCompletableFuture = pendingSnapshot.getResultFuture();
		if (snapshotCompletableFuture.isEndSnapshot()) {
			return;
		}
		boolean isLocalSnapshotEnabled = gRegionContext.getGContext().getGConfiguration().isLocalSnapshotEnabled();
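		// Every incRunningTask below is balanced by a decRunningTask on the corresponding exit path.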
		snapshotCompletableFuture.incRunningTask();

		try {
			Segment rs = addFlushingSegment();
			PageHandler pageHandler = createPageHandler(rs, false);
			gRegionContext.getWriteBufferStats().addTotalFlushingRecordCount(rs.getRecordCount());
			gRegionContext.getWriteBufferStats().addTotalRecordCount(-rs.getRecordCount());

			// Convert the segment into pages.
			if (LOG.isDebugEnabled()) {
				LOG.debug("Start to snapshot write buffer for {}.", checkpointId);
			}
			// TODO #SR error handling and unit tests
			eventExecutor.execute(() -> {
				final Map<PageAddress, DataPage> copiedReferencedDataPage = new HashMap<>();
				try {
					pageHandler.handle();
					endSegmentFlush(rs.getSegmentID());
					// copy page index
					PageIndex copyPageIndex = pageStore.getPageIndex().copy(copiedReferencedDataPage);
					pendingSnapshot.addGRegionSnapshotMeta(gRegionContext, copyPageIndex);

					if (LOG.isDebugEnabled()) {
						LOG.debug("Segment flush and pageIndex copy done for {}, will start to flush.", checkpointId);
					}
					// increase running task for the submitted task.
					snapshotCompletableFuture.incRunningTask();
					gRegionContext.getGContext().getSupervisor().getSnapshotManager().getSnapshotExecutor().execute(() -> {
						// All regions flush pages in the snapshotExecutor, so access to the flusher is thread-safe.
						PageBatchFlusher flusher = pendingSnapshot.getDfsPageBatchFlusher();
						EventExecutorGroup flushExecutorGroup = gRegionContext.getGContext().getSupervisor().getFlushExecutorGroup();
						FileCache fileCache = gRegionContext.getGContext().getSupervisor().getFileCache();
						try {
							Iterator<PageAddress> pageIterator = copyPageIndex.pageIterator();
							int totalPage = 0, totalLocalPage = 0;
							long totalSize = 0, totalLocalSize = 0;
							int incrementalPages = 0, localIncrementalPages = 0;
							long incrementalSize = 0, localIncrementalSize = 0;
							while (gRegionContext.getGContext().isDBNormal() && pageIterator.hasNext() && !snapshotCompletableFuture.isEndSnapshot()) {
								PageAddress pageAddress = pageIterator.next();
								try {
									EventExecutor flushEventExecutor = flushExecutorGroup.next();
									snapshotCompletableFuture.incRunningTask();

									// NOTE: when local recovery is enabled, it's better to snapshot locally before DFS.
									// When a page is flushed to DFS, it is also flushed locally if the local copy is
									// invalid (to avoid EOF from DFS); this happens in InfiniteFileCache#flushBatchPages,
									// so a page may be flushed twice, once by the local snapshot and once by the DFS
									// snapshot. Because local IO is faster than DFS, snapshotting locally first means
									// the local address may already be valid when the DFS snapshot runs.
									// TODO should we use snapshotEventExecutor or flushEventExecutor to snapshot local?
									if (isLocalSnapshotEnabled) {
										snapshotCompletableFuture.incRunningTask();
										if (!pageAddress.isLocalValid()) {
											++localIncrementalPages;
											localIncrementalSize += pageAddress.getDataLen();
										}
										fileCache.addPage(pageAddress,
											gRegionContext,
											flushEventExecutor,
											(success, throwable) -> {
												if (!success) {
													LOG.error("Write error when snapshot local", throwable);
													snapshotCompletableFuture.setEndSnapshot();
													snapshotCompletableFuture.completeExceptionally(throwable);
												}
												snapshotCompletableFuture.decRunningTask();
											});
										++totalLocalPage;
										totalLocalSize += pageAddress.getDataLen();
									}

									if (snapshotOperation.isForceFlushPage() || !pageAddress.isDfsValid()) {
										++incrementalPages;
										incrementalSize += pageAddress.getDataLen();
									} else {
										pendingSnapshot.getSnapshotCompaction().recordSharedPage(pageAddress);
									}
									flusher.addPage(pageAddress, gRegionContext,
										(success, throwable) -> {
											if (!success) {
												LOG.error("Write error when snapshot dfs", throwable);
												snapshotCompletableFuture.setEndSnapshot();
												snapshotCompletableFuture.completeExceptionally(throwable);
											}
											snapshotCompletableFuture.decRunningTask();
										});

									// dfs statistics
									++totalPage;
									totalSize += pageAddress.getDataLen();

								} finally {
									// Best effort to free the copied page's memory quickly.
									DataPage dataPage = copiedReferencedDataPage.remove(pageAddress);
									if (dataPage != null) {
										dataPage.release();
									}
								}

							}
							SnapshotStat snapshotStat = pendingSnapshot.getSnapshotStat();
							snapshotStat.addAndGetTotalPages(totalPage);
							snapshotStat.addAndGetTotalSize(totalSize);
							snapshotStat.addAndGetIncrementalPages(incrementalPages);
							snapshotStat.addAndGetIncrementalSize(incrementalSize);
							snapshotStat.addAndGetTotalLocalPages(totalLocalPage);
							snapshotStat.addAndGetTotalLocalSize(totalLocalSize);
							snapshotStat.addAndGetLocalIncrementalPages(localIncrementalPages);
							snapshotStat.addAndGetLocalIncrementalSize(localIncrementalSize);
						} finally {
							flusher.flush();
							snapshotCompletableFuture.decRunningTask();
							copiedReferencedDataPage.values().forEach(dataPage -> dataPage.release());
						}
					});
				} catch (Exception e) {
					snapshotCompletableFuture.setEndSnapshot();
					snapshotCompletableFuture.completeExceptionally(e);
					copiedReferencedDataPage.values().forEach(dataPage -> dataPage.release());
					LOG.error("Page handle error for {} with exception.", checkpointId, e);
				} finally {
					snapshotCompletableFuture.decRunningTask();
				}
			});
		} catch (Exception e) {
			snapshotCompletableFuture.setEndSnapshot();
			snapshotCompletableFuture.completeExceptionally(e);
			snapshotCompletableFuture.decRunningTask();
			LOG.error("add flushing segment failed with exception", e);
			throw e;
		}
	}
}