org.apache.flink.runtime.state.gemini.engine.memstore.AbstractWriteBuffer

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.gemini.engine.memstore;

import org.apache.flink.runtime.state.gemini.engine.GRegion;
import org.apache.flink.runtime.state.gemini.engine.GRegionContext;
import org.apache.flink.runtime.state.gemini.engine.dbms.GContext;
import org.apache.flink.runtime.state.gemini.engine.exceptions.GeminiRuntimeException;
import org.apache.flink.runtime.state.gemini.engine.filecache.FileCache;
import org.apache.flink.runtime.state.gemini.engine.handler.PageHandler;
import org.apache.flink.runtime.state.gemini.engine.page.DataPage;
import org.apache.flink.runtime.state.gemini.engine.page.PageAddress;
import org.apache.flink.runtime.state.gemini.engine.page.PageIndex;
import org.apache.flink.runtime.state.gemini.engine.page.PageStore;
import org.apache.flink.runtime.state.gemini.engine.rm.ReferenceCount.ReleaseType;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotCompletableFuture;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotManager;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotOperation;
import org.apache.flink.runtime.state.gemini.engine.snapshot.SnapshotStat;
import org.apache.flink.util.Preconditions;

import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutor;
import org.apache.flink.shaded.netty4.io.netty.util.concurrent.EventExecutorGroup;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

/**
 * Base class for {@link WriteBuffer} implementations. Writes accumulate in an
 * active segment; when water marks are exceeded the segment is cut off and
 * flushed asynchronously into {@link PageStore} pages, and on checkpoints the
 * buffer is drained into the snapshot.
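 *
 * <p>A minimal sketch of the segment lifecycle coordinated here; the subclass
 * name {@code HashWriteBuffer} is hypothetical and used only for illustration:
 *
 * <pre>{@code
 * AbstractWriteBuffer buffer = new HashWriteBuffer(gRegion, eventExecutor, pageStore);
 * // Writes go into the active segment managed by the subclass.
 * Segment active = buffer.getActiveSegment();
 * // On the write path checkResource() decides when to cut the segment off;
 * // the flush then runs asynchronously:
 * //   addFlushingSegment() -> createPageHandler(segment, false).handle()
 * //   -> endSegmentFlush(segmentID)
 * }</pre>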
 */
public abstract class AbstractWriteBuffer implements WriteBuffer {
	private static final Logger LOG = LoggerFactory.getLogger(AbstractWriteBuffer.class);
	protected final EventExecutor eventExecutor;
	protected final GRegionContext gRegionContext;
	protected long segmentID = 0;
	protected final GRegion gRegion;
	protected final PageStore pageStore;
	private final WriteBufferManager writeBufferManager;
	private long printTS = System.currentTimeMillis();
	private CompletableFuture<Void> lastFuture;

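	/**
	 * Creates a write buffer for the given region. The event executor runs the
	 * asynchronous segment flushes and the page store receives the resulting pages.
	 */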
	public AbstractWriteBuffer(
		GRegion gRegion, EventExecutor eventExecutor, PageStore pageStore) {
		this.gRegionContext = gRegion.getGRegionContext();
		this.gRegion = gRegion;
		this.eventExecutor = eventExecutor;
		this.pageStore = pageStore;
		this.writeBufferManager = gRegionContext.getGContext().getSupervisor().getWriteBufferManager();
	}

	@Override
	public EventExecutor getExecutor() {
		return this.eventExecutor;
	}

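	/**
	 * Called on the write path to check resource usage. When water marks are
	 * exceeded it cuts off the active segment and schedules an asynchronous
	 * flush; it may block while too many segments are already flushing.
	 */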
	void checkResource() {
		// 1. check whether a snapshot happened.
		// 2. check WriteBufferStats to decide whether to cut off the active segment.
		// 3. ask the WriteBufferManager whether we need to block.
		// No estimated size is available yet at this point.
		long startTime = System.currentTimeMillis();
		if (printTS + 60000 < startTime) {
			printTS = startTime;
			if (LOG.isDebugEnabled()) {
				LOG.debug("writeBufferStats: {}", gRegionContext.getWriteBufferStats());
				LOG.debug("pageStoreStats: {}", gRegionContext.getPageStoreStats());
			}
		}
		if (gRegionContext.getWriteBufferStats().getAverageKeyLen() < 0) {
			if (gRegionContext.getWriteBufferStats().getAverageKeyLen() == -2) {
				// launch an estimate handler to compute average key/value lengths.
				lastFuture = new CompletableFuture<>();
				gRegionContext.getWriteBufferStats().setAverageKeyLen(-1);
				final Segment segmentCopy = getActiveSegment().copySegment();
				eventExecutor.execute(() -> {
					createPageHandler(segmentCopy, true).handle();
					lastFuture.complete(null);
				});
			}

			if (getActiveSegment().getRecordCount() > 1000) {
				if (!lastFuture.isDone()) {
					try {
						lastFuture.get(10, TimeUnit.MILLISECONDS);
					} catch (Exception e) {
						return;
					}
				}
			} else {
				return;
			}
		}

		// Estimate MapValue and List sizes based on their elements.
		int writeBufferEstimatedSize = getEstimatedSize(getActiveSegment().getRecordCount());
		int totalWriteBufferEstimatedSize = writeBufferEstimatedSize + getEstimatedSize(writeBufferManager.getTotalRecordCount());

		if (writeBufferEstimatedSize < gRegionContext.getWriteBufferWaterMark()) {
			if (totalWriteBufferEstimatedSize < writeBufferManager.getTotalMemSize() || !writeBufferManager.isBestChoiceWriteBufferFlushing(
				this)) {
				return;
			}
		}

		GContext gContext = gRegionContext.getGContext();
		while (gRegionContext.getWriteBufferStats().getFlushingSegmentCount() >= gContext.getGConfiguration().getNumFlushingSegment()) {
			gContext.checkDBStatus();
			if (writeBufferManager.canFlushWriteBuffer(this)) {
				break;
			}
			synchronized (this) {
				try {
					wait(1);
					// Let's see how much this is; if it's too much, change it.
					writeBufferManager.increaseWriteBufferFlushBlock();
				} catch (InterruptedException e) {
					throw new GeminiRuntimeException(e);
				}
			}
		}

		long waitTime = System.currentTimeMillis() - startTime;
		if (waitTime > 10) {
			LOG.info("too many flushing segments or eviction took too long, wait time = {} ms ...", waitTime);
		}

		Segment rs = addFlushingSegment();
		gRegionContext.getWriteBufferStats().addTotalFlushingRecordCount(rs.getRecordCount());
		gRegionContext.getWriteBufferStats().addTotalRecordCount(-rs.getRecordCount());
		doSegmentFlush(rs, writeBufferEstimatedSize);
	}

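	/**
	 * Converts the given segment into pages asynchronously on the region's
	 * event executor and finishes the flush bookkeeping on success.
	 */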
	void doSegmentFlush(Segment segment, int estimatedSize) {
		eventExecutor.execute(() -> {
			try {
				PageHandler pageHandler = createPageHandler(segment, false);
				pageHandler.handle();
				//TODO to handle exception, add mechanism to re-flush this segment.
				endSegmentFlush(segment.getSegmentID());
			} catch (Exception e) {
				LOG.error("flush segment failed. {}", e);
			}
		});
	}

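	/**
	 * Removes the finished segment from the flushing queue, updates the
	 * flushing statistics and wakes up writers blocked in {@link #checkResource()}.
	 */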
	private void endSegmentFlush(long segmentID) {
		Segment segment = pollFlushingSegment();
		Preconditions.checkArgument(segment != null, "error segment!");
		Preconditions.checkArgument(segment.getSegmentID() == segmentID, "error segment!");
		gRegionContext.getWriteBufferStats().addFlushingSegmentCount(-1);
		gRegionContext.getWriteBufferStats().addTotalFlushingRecordCount(-segment.getRecordCount());
		synchronized (this) {
			notify();
		}
	}

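	/** Returns the segment that currently accepts writes. */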
	public abstract Segment getActiveSegment();

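	/** Cuts off the active segment, appends it to the flushing queue and returns it. */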
	abstract Segment addFlushingSegment();

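	/** Removes and returns the oldest segment from the flushing queue. */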
	abstract Segment pollFlushingSegment();

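	/**
	 * Creates the handler that turns the given segment into pages; when
	 * {@code onlyEstimatedSize} is true the handler only estimates the
	 * average key/value lengths instead of producing pages.
	 */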
	abstract PageHandler createPageHandler(Segment segment, boolean onlyEstimatedSize);

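	/**
	 * Estimates the in-memory size of {@code elementSize} records from the
	 * average key/value lengths and the page size rate; returns 0 when no
	 * average key length has been estimated yet.
	 */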
	private int getEstimatedSize(long elementSize) {
		if (gRegionContext.getWriteBufferStats().getAverageKeyLen() <= 0) {
			LOG.error("Let's see whether it will happen!");
			return 0;
		}
		return (int) (gRegionContext.getPageStoreStats().getPageSizeRate()
			* (gRegionContext.getWriteBufferStats().getAverageKeyLen()
				+ gRegionContext.getWriteBufferStats().getAverageValueLen())
			* elementSize);
	}

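	/**
	 * Flushes the write buffer as part of a checkpoint: the active segment is
	 * cut off and turned into pages, the page index is deep-copied, and every
	 * page is flushed to DFS (and optionally to the local file cache) on the
	 * snapshot executors. Progress is tracked via the running-task count of
	 * the snapshot future.
	 */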
	@Override
	public void doSnapshot(SnapshotOperation snapshotOperation) {
		SnapshotManager.PendingSnapshot pendingSnapshot = snapshotOperation.getPendingSnapshot();
		long checkpointId = pendingSnapshot.getCheckpointId();
		SnapshotCompletableFuture snapshotCompletableFuture = pendingSnapshot.getResultFuture();
		if (snapshotCompletableFuture.isEndSnapshot()) {
			return;
		}
		boolean isLocalSnapshotEnabled = gRegionContext.getGContext().getGConfiguration().isLocalSnapshotEnabled();
		snapshotCompletableFuture.incRunningTask();

		try {
			Segment rs = addFlushingSegment();
			PageHandler pageHandler = createPageHandler(rs, false);
			gRegionContext.getWriteBufferStats().addTotalFlushingRecordCount(rs.getRecordCount());
			gRegionContext.getWriteBufferStats().addTotalRecordCount(-rs.getRecordCount());

			// segment to page.
			if (LOG.isDebugEnabled()) {
				LOG.debug("Start to snapshot write buffer for {}.", checkpointId);
			}
			// TODO #SR error handle and UT
			eventExecutor.execute(() -> {
				// Pages whose reference counts were increased by the deep copy; they are released as soon as they are flushed.
				final Map<PageAddress, DataPage> allAddReferenceDataPage = new HashMap<>();
				try {
					pageHandler.handle();
					endSegmentFlush(rs.getSegmentID());
					// copy page index
					PageIndex copyPageIndex = pageStore.getPageIndex().deepCopy(allAddReferenceDataPage);
					pendingSnapshot.addGRegionSnapshotMeta(gRegionContext.getTableName(),
						gRegionContext.getRegionId(),
						copyPageIndex,
						gRegionContext.getLastSeqID(),
						gRegionContext.getRemoveAllSeqID());

					if (LOG.isDebugEnabled()) {
						LOG.debug("Segment flush and pageIndex copy done for {}, will start to flush.", checkpointId);
					}
					// increase running task for the submitted task.
					snapshotCompletableFuture.incRunningTask();
					gRegionContext.getGContext().getSupervisor().getSnapshotManager().getSnapshotExecutor().execute(() -> {
						try {
							Iterator<PageAddress> pageIterator = copyPageIndex.pageIterator();
							EventExecutorGroup snapshotEventExecutorGroup = gRegionContext.getGContext().getSupervisor().getSnapshotExecutorGroup();
							FileCache fileCache = gRegionContext.getGContext().getSupervisor().getFileCache();
							int totalPage = 0;
							long totalSize = 0;
							int incrementalPages = 0;
							long incrementalSize = 0;
							int totalLocalPage = 0;
							long totalLocalSize = 0;
							int localIncrementalPages = 0;
							long localIncrementalSize = 0;
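							// Flush every page referenced by the copied index; each asynchronous
							// flush registers itself as a running task on the snapshot future.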
							while (gRegionContext.getGContext().isDBNormal() && pageIterator.hasNext() && !snapshotCompletableFuture.isEndSnapshot()) {
								PageAddress pageAddress = pageIterator.next();
								try {
									EventExecutor snapshotEventExecutor = snapshotEventExecutorGroup.next();
									snapshotCompletableFuture.incRunningTask();
									if (!pageAddress.isDfsValid()) {
										++incrementalPages;
										incrementalSize += pageAddress.getDataLen();
									}
									// TODO We can deliver a Callable to fileCache.flushPage and execute it before the async flush,
									// but I think it's a bit ugly, and we will discuss it later. For now, it has no effect on the file cache.
									fileCache.flushPage(pageAddress, gRegionContext, snapshotEventExecutor,
										// TODO this should be true for the first snapshot after scale.
										false, (success, throwable) -> {
											if (!success) {
												LOG.error("Write error when snapshot dfs.");
												snapshotCompletableFuture.setEndSnapshot();
												snapshotCompletableFuture.completeExceptionally(throwable);
											}
											snapshotCompletableFuture.decRunningTask();
										});

									// dfs statistics
									++totalPage;
									totalSize += pageAddress.getDataLen();

									if (isLocalSnapshotEnabled) {
										snapshotCompletableFuture.incRunningTask();
										if (!pageAddress.isLocalValid()) {
											++localIncrementalPages;
											localIncrementalSize += pageAddress.getDataLen();
										}
										fileCache.addPage(pageAddress,
											gRegionContext,
											snapshotEventExecutor,
											(success, throwable) -> {
												if (!success) {
													LOG.error("Write error when snapshot local.");
													snapshotCompletableFuture.setEndSnapshot();
													snapshotCompletableFuture.completeExceptionally(throwable);
												}
												snapshotCompletableFuture.decRunningTask();
											});
										++totalLocalPage;
										totalLocalSize += pageAddress.getDataLen();
									}
									// Remove for fast GC; retrying a snapshot is not supported.
									pageIterator.remove();
								} finally {
									// Best effort to free space quickly.
									DataPage dataPage = allAddReferenceDataPage.remove(pageAddress);
									if (dataPage != null) {
										dataPage.delReferenceCount(ReleaseType.Normal);
									}
								}

							}
							SnapshotStat snapshotStat = pendingSnapshot.getSnapshotStat();
							snapshotStat.addAndGetTotalPages(totalPage);
							snapshotStat.addAndGetTotalSize(totalSize);
							snapshotStat.addAndGetIncrementalPages(incrementalPages);
							snapshotStat.addAndGetIncrementalSize(incrementalSize);
							snapshotStat.addAndGetTotalLocalPages(totalLocalPage);
							snapshotStat.addAndGetTotalLocalSize(totalLocalSize);
							snapshotStat.addAndGetLocalIncrementalPages(localIncrementalPages);
							snapshotStat.addAndGetLocalIncrementalSize(localIncrementalSize);
							snapshotCompletableFuture.decRunningTask();
						} finally {
							allAddReferenceDataPage.values().forEach(datapage -> datapage.delReferenceCount(ReleaseType.Normal));
						}
					});
				} catch (Exception e) {
					snapshotCompletableFuture.setEndSnapshot();
					snapshotCompletableFuture.completeExceptionally(e);
					allAddReferenceDataPage.values().forEach(datapage -> datapage.delReferenceCount(ReleaseType.Normal));
					LOG.error("Page handle error for {} with exception {}.", checkpointId, e.getMessage(), e);
				} finally {
					snapshotCompletableFuture.decRunningTask();
				}
			});
		} catch (Exception e) {
			snapshotCompletableFuture.decRunningTask();
			snapshotCompletableFuture.setEndSnapshot();
			snapshotCompletableFuture.completeExceptionally(e);
			LOG.error("add flushing segment failed with exception {}", e);
			throw e;
		}
	}
}