All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.state.SharedStateRegistryImpl Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state;

import org.apache.flink.annotation.Internal;
import org.apache.flink.core.execution.RecoveryClaimMode;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.runtime.checkpoint.CompletedCheckpoint;
import org.apache.flink.runtime.checkpoint.SnapshotType.SharingFilesStrategy;
import org.apache.flink.util.concurrent.Executors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.RejectedExecutionException;

import static org.apache.flink.runtime.checkpoint.SnapshotType.SharingFilesStrategy.NO_SHARING;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;

/** {@link SharedStateRegistry} implementation. */
@Internal
public class SharedStateRegistryImpl implements SharedStateRegistry {

    private static final Logger LOG = LoggerFactory.getLogger(SharedStateRegistryImpl.class);

    /** All registered state objects by an artificial key */
    private final Map registeredStates;

    private final Map> restoredCheckpointSharingStrategies =
            new HashMap<>();

    /** This flag indicates whether or not the registry is open or if close() was called */
    private boolean open;

    /** Executor for async state deletion */
    private final Executor asyncDisposalExecutor;

    /** Checkpoint ID below which no state is discarded, inclusive. */
    private long highestNotClaimedCheckpointID = -1L;

    /** Default uses direct executor to delete unreferenced state */
    public SharedStateRegistryImpl() {
        this(Executors.directExecutor());
    }

    public SharedStateRegistryImpl(Executor asyncDisposalExecutor) {
        this.registeredStates = new HashMap<>();
        this.asyncDisposalExecutor = checkNotNull(asyncDisposalExecutor);
        this.open = true;
    }

    @Override
    public StreamStateHandle registerReference(
            final SharedStateRegistryKey registrationKey,
            final StreamStateHandle newHandle,
            final long checkpointID,
            final boolean preventDiscardingCreatedCheckpoint) {

        checkNotNull(newHandle, "State handle should not be null.");

        SharedStateEntry entry;

        synchronized (registeredStates) {
            checkState(open, "Attempt to register state to closed SharedStateRegistry.");

            entry = registeredStates.get(registrationKey);

            if (entry == null) {
                checkState(
                        !isPlaceholder(newHandle),
                        "Attempt to reference unknown state: " + registrationKey);

                LOG.trace(
                        "Registered new shared state {} under key {}.", newHandle, registrationKey);
                entry = new SharedStateEntry(newHandle, checkpointID);
                registeredStates.put(registrationKey, entry);

                // no further handling
                return entry.stateHandle;

            } else if (entry.stateHandle == newHandle) {
                // might be a bug but state backend is not required to use a place-holder
                LOG.info(
                        "Duplicated registration under key {} with the same object: {}",
                        registrationKey,
                        newHandle);
            } else if (Objects.equals(entry.stateHandle, newHandle)) {
                LOG.trace(
                        "Duplicated registration under key {} with the new object: {}.",
                        registrationKey,
                        newHandle);
            } else if (isPlaceholder(newHandle)) {
                LOG.trace(
                        "Duplicated registration under key {} with a placeholder (normal case)",
                        registrationKey);
            } else {
                // might be a bug expect the StreamStateHandleWrapper used by
                // ChangelogStateBackendHandleImpl
                LOG.info(
                        "the registered handle should equal to the previous one or is a placeholder, register key:{}, handle:{}",
                        registrationKey,
                        newHandle);
                if (entry.stateHandle instanceof EmptyDiscardStateObjectForRegister) {
                    // This situation means that newHandle is a StreamStateHandleWrapper registered
                    // by ChangelogStateBackendHandleImpl, keep the new one for discard the
                    // underlying handle while it was useless. Refactor this once FLINK-25862 is
                    // resolved.
                    entry.stateHandle = newHandle;
                } else {
                    throw new IllegalStateException(
                            "StateObjects underlying same key should be equal !");
                }
            }

            LOG.trace(
                    "Updating last checkpoint for {} from {} to {}",
                    registrationKey,
                    entry.lastUsedCheckpointID,
                    checkpointID);
            entry.advanceLastUsingCheckpointID(checkpointID);

            if (preventDiscardingCreatedCheckpoint) {
                entry.preventDiscardingCreatedCheckpoint();
            }
        } // end of synchronized (registeredStates)

        return entry.stateHandle;
    }

    @Override
    public Set unregisterUnusedState(long lowestCheckpointID) {
        Set checkpointInUse = new HashSet<>();
        LOG.debug(
                "Discard state created before checkpoint {} and not used afterwards",
                lowestCheckpointID);
        List subsumed = new ArrayList<>();
        // Iterate over all the registered state handles.
        // Using a simple loop and NOT index by checkpointID because:
        // 1. Maintaining index leads to the same time complexity and worse memory complexity
        // 2. Most of the entries are expected to be carried to the next checkpoint
        synchronized (registeredStates) {
            Iterator it = registeredStates.values().iterator();
            while (it.hasNext()) {
                SharedStateEntry entry = it.next();
                if (entry.lastUsedCheckpointID < lowestCheckpointID) {
                    if (entry.createdByCheckpointID > highestNotClaimedCheckpointID) {
                        subsumed.add(entry.stateHandle);
                    }
                    it.remove();
                } else if (preventsDiscardingCreatedCheckpoint(entry)) {
                    // Newly created checkpoints can be discarded right after subsumption. But the
                    // initial checkpoint needs to be kept until all of its private AND shared state
                    // is not in use. This is to enable recovery in CLAIM mode from:
                    // - native incremental savepoints
                    // - non-changelog checkpoints with changelog enabled
                    // Keeping any checkpoint for longer leaves its folder undeleted on job
                    // cancellation (and also on crash or JM failover).
                    checkpointInUse.add(entry.createdByCheckpointID);
                }
            }
        }
        LOG.trace("Discard {} state asynchronously", subsumed.size());
        for (StreamStateHandle handle : subsumed) {
            scheduleAsyncDelete(handle);
        }
        return checkpointInUse;
    }

    @Override
    public void registerAll(
            Iterable stateHandles, long checkpointID) {

        if (stateHandles == null) {
            return;
        }

        synchronized (registeredStates) {
            for (CompositeStateHandle stateHandle : stateHandles) {
                stateHandle.registerSharedStates(this, checkpointID);
            }
        }
    }

    @Override
    public void registerAllAfterRestored(CompletedCheckpoint checkpoint, RecoveryClaimMode mode) {
        registerAll(checkpoint.getOperatorStates().values(), checkpoint.getCheckpointID());
        restoredCheckpointSharingStrategies.put(
                checkpoint.getCheckpointID(),
                checkpoint
                        .getRestoredProperties()
                        .map(props -> props.getCheckpointType().getSharingFilesStrategy()));
        // In NO_CLAIM and LEGACY claim modes, shared state of the initial checkpoints must be
        // preserved. This is achieved by advancing highestRetainCheckpointID here, and then
        // checking entry.createdByCheckpointID against it on checkpoint subsumption.
        // In CLAIM mode, the shared state of the initial checkpoints must be
        // discarded as soon as it becomes unused - so highestRetainCheckpointID is not updated.
        if (mode != RecoveryClaimMode.CLAIM) {
            highestNotClaimedCheckpointID =
                    Math.max(highestNotClaimedCheckpointID, checkpoint.getCheckpointID());
        }
    }

    @Override
    public void checkpointCompleted(long checkpointId) {
        // nothing to do here
    }

    @Override
    public String toString() {
        synchronized (registeredStates) {
            return "SharedStateRegistry{" + "registeredStates=" + registeredStates + '}';
        }
    }

    private void scheduleAsyncDelete(StreamStateHandle streamStateHandle) {
        // We do the small optimization to not issue discards for placeholders, which are NOPs.
        if (streamStateHandle != null && !isPlaceholder(streamStateHandle)) {
            LOG.debug("Scheduled delete of state handle {}.", streamStateHandle);
            AsyncDisposalRunnable asyncDisposalRunnable =
                    new AsyncDisposalRunnable(streamStateHandle);
            try {
                asyncDisposalExecutor.execute(asyncDisposalRunnable);
            } catch (RejectedExecutionException ex) {
                // TODO This is a temporary fix for a problem during
                // ZooKeeperCompletedCheckpointStore#shutdown:
                // Disposal is issued in another async thread and the shutdown proceeds to close the
                // I/O Executor pool.
                // This leads to RejectedExecutionException once the async deletes are triggered by
                // ZK. We need to
                // wait for all pending ZK deletes before closing the I/O Executor pool. We can
                // simply call #run()
                // because we are already in the async ZK thread that disposes the handles.
                asyncDisposalRunnable.run();
            }
        }
    }

    private boolean isPlaceholder(StreamStateHandle stateHandle) {
        return stateHandle instanceof PlaceholderStreamStateHandle;
    }

    @Override
    public void close() {
        synchronized (registeredStates) {
            open = false;
        }
    }

    /** Encapsulates the operation the delete state handles asynchronously. */
    private static final class AsyncDisposalRunnable implements Runnable {

        private final StateObject toDispose;

        public AsyncDisposalRunnable(StateObject toDispose) {
            this.toDispose = checkNotNull(toDispose);
        }

        @Override
        public void run() {
            try {
                toDispose.discardState();
            } catch (Exception e) {
                LOG.warn(
                        "A problem occurred during asynchronous disposal of a shared state object: {}",
                        toDispose,
                        e);
            }
        }
    }
    /** An entry in the registry, tracking the handle and the corresponding reference count. */
    private static final class SharedStateEntry {

        /**
         * Whether usage of this state should prevent deletion of the checkpoint that created this
         * state.
         */
        private boolean preventDiscardingCreatedCheckpoint = false;

        /** The shared state handle */
        StreamStateHandle stateHandle;

        private final long createdByCheckpointID;

        private long lastUsedCheckpointID;

        SharedStateEntry(StreamStateHandle value, long checkpointID) {
            this.stateHandle = value;
            this.createdByCheckpointID = checkpointID;
            this.lastUsedCheckpointID = checkpointID;
        }

        @Override
        public String toString() {
            return "SharedStateEntry{"
                    + "stateHandle="
                    + stateHandle
                    + ", createdByCheckpointID="
                    + createdByCheckpointID
                    + ", lastUsedCheckpointID="
                    + lastUsedCheckpointID
                    + '}';
        }

        private void advanceLastUsingCheckpointID(long checkpointID) {
            lastUsedCheckpointID = Math.max(checkpointID, lastUsedCheckpointID);
        }

        private void preventDiscardingCreatedCheckpoint() {
            // Changed from false to true when a newer checkpoint starts reusing this state entry
            // after recovery. This is to delay discarding the checkpoint until all of its
            // state (both shared and private) is not used. That allows to handle transition from
            // changelog off to on in CLAIM mode.
            this.preventDiscardingCreatedCheckpoint = true;
        }
    }

    /** An object with empty discardState for registering. */
    public static class EmptyDiscardStateObjectForRegister implements StreamStateHandle {
        private static final long serialVersionUID = 1L;

        private StateHandleID stateHandleID;

        public EmptyDiscardStateObjectForRegister(StateHandleID stateHandleID) {
            this.stateHandleID = stateHandleID;
        }

        @Override
        public void discardState() throws Exception {}

        @Override
        public long getStateSize() {
            throw new UnsupportedOperationException("Should not call here.");
        }

        @Override
        public FSDataInputStream openInputStream() throws IOException {
            throw new UnsupportedOperationException("Should not call here.");
        }

        @Override
        public Optional asBytesIfInMemory() {
            throw new UnsupportedOperationException("Should not call here.");
        }

        @Override
        public PhysicalStateHandleID getStreamStateHandleID() {
            throw new UnsupportedOperationException("Should not call here.");
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            EmptyDiscardStateObjectForRegister that = (EmptyDiscardStateObjectForRegister) o;
            return Objects.equals(stateHandleID, that.stateHandleID);
        }

        @Override
        public int hashCode() {
            return Objects.hash(stateHandleID);
        }

        @Override
        public String toString() {
            return "EmptyDiscardStateObject{" + stateHandleID + '}';
        }
    }

    private boolean preventsDiscardingCreatedCheckpoint(SharedStateEntry entry) {
        // explicitly set by the backend, e.g. private state is reused
        if (entry.preventDiscardingCreatedCheckpoint
                && restoredCheckpointSharingStrategies.containsKey(entry.createdByCheckpointID)) {
            return true;
        }
        // With NO_SHARING strategy, shared state, if any, is bundled inside the checkpoint folder.
        // So the folder deletion should be delayed as long as some shared state is still in use.
        // That allows to recover from Incremental RocksDB Native Savepoint in CLAIM mode.
        // noinspection RedundantIfStatement
        if (restoredCheckpointSharingStrategies
                .getOrDefault(entry.createdByCheckpointID, Optional.empty())
                .filter(sharingFilesStrategy -> sharingFilesStrategy == NO_SHARING)
                .isPresent()) {
            return true;
        }

        return false;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy