org.apache.flink.runtime.checkpoint.Checkpoints
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.checkpoint;

import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.checkpoint.savepoint.Savepoint;
import org.apache.flink.runtime.checkpoint.savepoint.SavepointSerializer;
import org.apache.flink.runtime.checkpoint.savepoint.SavepointSerializers;
import org.apache.flink.runtime.checkpoint.savepoint.SavepointV2;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.state.CompletedCheckpointStorageLocation;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.runtime.state.StateBackendLoader;
import org.apache.flink.runtime.state.StreamStateHandle;
import org.apache.flink.runtime.state.memory.MemoryStateBackend;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A utility class with methods to write, load, and dispose of checkpoint and savepoint metadata.
 *
 * <p>Stored checkpoint metadata files have the following format:
 * <pre>[MagicNumber (int) | Format Version (int) | Checkpoint Metadata (variable)]</pre>
 *
 * <p>The actual savepoint serialization is version-specific via the {@link SavepointSerializer}.
 */
public class Checkpoints {

    private static final Logger LOG = LoggerFactory.getLogger(Checkpoints.class);

    /** Magic number at the beginning of every checkpoint metadata file, for sanity checks. */
    public static final int HEADER_MAGIC_NUMBER = 0x4960672d;

    // ------------------------------------------------------------------------
    //  Writing out checkpoint metadata
    // ------------------------------------------------------------------------

    public static <T extends Savepoint> void storeCheckpointMetadata(
            T checkpointMetadata,
            OutputStream out) throws IOException {

        DataOutputStream dos = new DataOutputStream(out);
        storeCheckpointMetadata(checkpointMetadata, dos);
    }

    public static <T extends Savepoint> void storeCheckpointMetadata(
            T checkpointMetadata,
            DataOutputStream out) throws IOException {

        // write generic header
        out.writeInt(HEADER_MAGIC_NUMBER);
        out.writeInt(checkpointMetadata.getVersion());

        // write checkpoint metadata
        SavepointSerializer<T> serializer = SavepointSerializers.getSerializer(checkpointMetadata);
        serializer.serialize(checkpointMetadata, out);
    }

    // ------------------------------------------------------------------------
    //  Reading and validating checkpoint metadata
    // ------------------------------------------------------------------------

    public static Savepoint loadCheckpointMetadata(DataInputStream in, ClassLoader classLoader) throws IOException {
        checkNotNull(in, "input stream");
        checkNotNull(classLoader, "classLoader");

        final int magicNumber = in.readInt();

        if (magicNumber == HEADER_MAGIC_NUMBER) {
            final int version = in.readInt();
            final SavepointSerializer<?> serializer = SavepointSerializers.getSerializer(version);

            if (serializer != null) {
                return serializer.deserialize(in, classLoader);
            }
            else {
                throw new IOException("Unrecognized checkpoint version number: " + version);
            }
        }
        else {
            throw new IOException("Unexpected magic number. This can have multiple reasons: " +
                    "(1) You are trying to load a Flink 1.0 savepoint, which is not supported by this " +
                    "version of Flink. (2) The file you were pointing to is not a savepoint at all. " +
                    "(3) The savepoint file has been corrupted.");
        }
    }

    @SuppressWarnings("deprecation")
    public static CompletedCheckpoint loadAndValidateCheckpoint(
            JobID jobId,
            Map<JobVertexID, ExecutionJobVertex> tasks,
            CompletedCheckpointStorageLocation location,
            ClassLoader classLoader,
            boolean allowNonRestoredState) throws IOException {

        checkNotNull(jobId, "jobId");
        checkNotNull(tasks, "tasks");
        checkNotNull(location, "location");
        checkNotNull(classLoader, "classLoader");

        final StreamStateHandle metadataHandle = location.getMetadataHandle();
        final String checkpointPointer = location.getExternalPointer();

        // (1) load the savepoint
        final Savepoint rawCheckpointMetadata;
        try (InputStream in = metadataHandle.openInputStream()) {
            DataInputStream dis = new DataInputStream(in);
            rawCheckpointMetadata = loadCheckpointMetadata(dis, classLoader);
        }

        final Savepoint checkpointMetadata = rawCheckpointMetadata.getTaskStates() == null ?
                rawCheckpointMetadata :
                SavepointV2.convertToOperatorStateSavepointV2(tasks, rawCheckpointMetadata);

        // generate mapping from operator to task
        Map<OperatorID, ExecutionJobVertex> operatorToJobVertexMapping = new HashMap<>();
        for (ExecutionJobVertex task : tasks.values()) {
            for (OperatorID operatorID : task.getOperatorIDs()) {
                operatorToJobVertexMapping.put(operatorID, task);
            }
        }

        // (2) validate it (parallelism, etc)
        boolean expandedToLegacyIds = false;

        HashMap<OperatorID, OperatorState> operatorStates = new HashMap<>(checkpointMetadata.getOperatorStates().size());
        for (OperatorState operatorState : checkpointMetadata.getOperatorStates()) {

            ExecutionJobVertex executionJobVertex = operatorToJobVertexMapping.get(operatorState.getOperatorID());

            // on the first time we can not find the execution job vertex for an id, we also consider alternative ids,
            // for example as generated from older flink versions, to provide backwards compatibility.
            if (executionJobVertex == null && !expandedToLegacyIds) {
                operatorToJobVertexMapping = ExecutionJobVertex.includeAlternativeOperatorIDs(operatorToJobVertexMapping);
                executionJobVertex = operatorToJobVertexMapping.get(operatorState.getOperatorID());
                expandedToLegacyIds = true;
                LOG.info("Could not find ExecutionJobVertex. Including user-defined OperatorIDs in search.");
            }

            if (executionJobVertex != null) {

                if (executionJobVertex.getMaxParallelism() == operatorState.getMaxParallelism()
                        || !executionJobVertex.isMaxParallelismConfigured()) {
                    operatorStates.put(operatorState.getOperatorID(), operatorState);
                } else {
                    String msg = String.format("Failed to rollback to checkpoint/savepoint %s. " +
                                    "Max parallelism mismatch between checkpoint/savepoint state and new program. " +
                                    "Cannot map operator %s with max parallelism %d to new program with " +
                                    "max parallelism %d. This indicates that the program has been changed " +
                                    "in a non-compatible way after the checkpoint/savepoint.",
                            checkpointMetadata,
                            operatorState.getOperatorID(),
                            operatorState.getMaxParallelism(),
                            executionJobVertex.getMaxParallelism());

                    throw new IllegalStateException(msg);
                }
            } else if (allowNonRestoredState) {
                LOG.info("Skipping savepoint state for operator {}.", operatorState.getOperatorID());
            } else {
                for (OperatorSubtaskState operatorSubtaskState : operatorState.getStates()) {
                    if (operatorSubtaskState.hasState()) {
                        String msg = String.format("Failed to rollback to checkpoint/savepoint %s. " +
                                        "Cannot map checkpoint/savepoint state for operator %s to the new program, " +
                                        "because the operator is not available in the new program. If " +
                                        "you want to allow to skip this, you can set the --allowNonRestoredState " +
                                        "option on the CLI.",
                                checkpointPointer, operatorState.getOperatorID());

                        throw new IllegalStateException(msg);
                    }
                }

                LOG.info("Skipping empty savepoint state for operator {}.", operatorState.getOperatorID());
            }
        }

        // (3) convert to checkpoint so the system can fall back to it
        CheckpointProperties props = CheckpointProperties.forSavepoint();

        return new CompletedCheckpoint(
                jobId,
                checkpointMetadata.getCheckpointId(),
                0L,
                0L,
                operatorStates,
                checkpointMetadata.getMasterStates(),
                props,
                location);
    }

    // ------------------------------------------------------------------------
    //  Savepoint Disposal Hooks
    // ------------------------------------------------------------------------

    public static void disposeSavepoint(
            String pointer,
            StateBackend stateBackend,
            ClassLoader classLoader) throws IOException, FlinkException {

        checkNotNull(pointer, "location");
        checkNotNull(stateBackend, "stateBackend");
        checkNotNull(classLoader, "classLoader");

        final CompletedCheckpointStorageLocation checkpointLocation = stateBackend.resolveCheckpoint(pointer);

        final StreamStateHandle metadataHandle = checkpointLocation.getMetadataHandle();

        // load the savepoint object (the metadata) to have all the state handles that we need
        // to dispose of all state
        final Savepoint savepoint;
        try (InputStream in = metadataHandle.openInputStream();
                DataInputStream dis = new DataInputStream(in)) {

            savepoint = loadCheckpointMetadata(dis, classLoader);
        }

        Exception exception = null;

        // first dispose the savepoint metadata, so that the savepoint is not
        // addressable any more even if the following disposal fails
        try {
            metadataHandle.discardState();
        } catch (Exception e) {
            exception = e;
        }

        // now dispose the savepoint data
        try {
            savepoint.dispose();
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        // now dispose the location (directory, table, whatever)
        try {
            checkpointLocation.disposeStorageLocation();
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        // forward exceptions caught in the process
        if (exception != null) {
            ExceptionUtils.rethrowIOException(exception);
        }
    }

    public static void disposeSavepoint(
            String pointer,
            Configuration configuration,
            ClassLoader classLoader,
            @Nullable Logger logger) throws IOException, FlinkException {

        checkNotNull(pointer, "location");
        checkNotNull(configuration, "configuration");
        checkNotNull(classLoader, "classLoader");

        StateBackend backend = loadStateBackend(configuration, classLoader, logger);

        disposeSavepoint(pointer, backend, classLoader);
    }

    @Nonnull
    public static StateBackend loadStateBackend(Configuration configuration, ClassLoader classLoader, @Nullable Logger logger) {
        if (logger != null) {
            logger.info("Attempting to load configured state backend for savepoint disposal");
        }

        StateBackend backend = null;
        try {
            backend = StateBackendLoader.loadStateBackendFromConfig(configuration, classLoader, null);

            if (backend == null && logger != null) {
                logger.info("No state backend configured, attempting to dispose savepoint " +
                        "with default backend (file system based)");
            }
        }
        catch (Throwable t) {
            // catches exceptions and errors (like linking errors)
            if (logger != null) {
                logger.info("Could not load configured state backend.");
                logger.debug("Detailed exception:", t);
            }
        }

        if (backend == null) {
            // We use the memory state backend by default. The MemoryStateBackend is actually
            // FileSystem-based for metadata
            backend = new MemoryStateBackend();
        }
        return backend;
    }

    // ------------------------------------------------------------------------

    /** This class contains only static utility methods and is not meant to be instantiated. */
    private Checkpoints() {}
}
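
For illustration (this is not part of the Flink source above): given the header layout documented in the class Javadoc, [MagicNumber (int) | Format Version (int) | Checkpoint Metadata (variable)], a minimal standalone sketch can check whether a file looks like Flink checkpoint/savepoint metadata and report its format version without deserializing the version-specific part. The file path and class name below are hypothetical; the magic-number constant is duplicated so the sketch compiles without Flink on the classpath.

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Minimal sketch: reads only the generic header written by
 * Checkpoints.storeCheckpointMetadata (magic number + format version).
 */
public class MetadataHeaderCheck {

    // Same value as Checkpoints.HEADER_MAGIC_NUMBER, duplicated for standalone use.
    private static final int HEADER_MAGIC_NUMBER = 0x4960672d;

    public static void main(String[] args) throws IOException {
        // Hypothetical path to a savepoint metadata file.
        String path = args.length > 0 ? args[0] : "/tmp/savepoint-abc123/_metadata";

        try (DataInputStream in = new DataInputStream(new FileInputStream(path))) {
            int magic = in.readInt();   // [MagicNumber (int) | ...]
            if (magic != HEADER_MAGIC_NUMBER) {
                System.out.println("Not a recognizable checkpoint metadata file: " + path);
                return;
            }
            int version = in.readInt(); // [... | Format Version (int) | ...]
            System.out.println("Checkpoint metadata format version: " + version);
        }
    }
}

DataInputStream.readInt is big-endian, matching the DataOutputStream.writeInt calls in storeCheckpointMetadata, so the header round-trips as shown. A real load would instead go through Checkpoints.loadCheckpointMetadata with an appropriate user classloader, which performs the same magic-number check and then dispatches to the version-specific SavepointSerializer.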