org.apache.flink.runtime.checkpoint.Checkpoints
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.checkpoint;

import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.OperatorIDPair;
import org.apache.flink.runtime.checkpoint.metadata.CheckpointMetadata;
import org.apache.flink.runtime.checkpoint.metadata.MetadataSerializer;
import org.apache.flink.runtime.checkpoint.metadata.MetadataSerializers;
import org.apache.flink.runtime.checkpoint.metadata.MetadataV3Serializer;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.state.CheckpointStorage;
import org.apache.flink.runtime.state.CheckpointStorageLoader;
import org.apache.flink.runtime.state.CompletedCheckpointStorageLocation;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.runtime.state.StateBackendLoader;
import org.apache.flink.runtime.state.StreamStateHandle;
import org.apache.flink.runtime.state.hashmap.HashMapStateBackend;
import org.apache.flink.runtime.state.storage.JobManagerCheckpointStorage;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A utility class with the methods to write/load/dispose the checkpoint and savepoint metadata.
 *
 * <p>Stored checkpoint metadata files have the following format:
 *
 * <pre>[MagicNumber (int) | Format Version (int) | Checkpoint Metadata (variable)]</pre>
 *
 * <p>The actual savepoint serialization is version-specific via the {@link MetadataSerializer}.
 */
public class Checkpoints {

    private static final Logger LOG = LoggerFactory.getLogger(Checkpoints.class);

    /** Magic number at the beginning of every checkpoint metadata file, for sanity checks. */
    public static final int HEADER_MAGIC_NUMBER = 0x4960672d;

    // ------------------------------------------------------------------------
    //  Writing out checkpoint metadata
    // ------------------------------------------------------------------------

    public static void storeCheckpointMetadata(
            CheckpointMetadata checkpointMetadata, OutputStream out) throws IOException {

        DataOutputStream dos = new DataOutputStream(out);
        storeCheckpointMetadata(checkpointMetadata, dos);
    }

    public static void storeCheckpointMetadata(
            CheckpointMetadata checkpointMetadata, DataOutputStream out) throws IOException {

        // write generic header
        out.writeInt(HEADER_MAGIC_NUMBER);
        out.writeInt(MetadataV3Serializer.VERSION);
        MetadataV3Serializer.serialize(checkpointMetadata, out);
    }

    // ------------------------------------------------------------------------
    //  Reading and validating checkpoint metadata
    // ------------------------------------------------------------------------

    public static CheckpointMetadata loadCheckpointMetadata(
            DataInputStream in, ClassLoader classLoader, String externalPointer)
            throws IOException {
        checkNotNull(in, "input stream");
        checkNotNull(classLoader, "classLoader");

        final int magicNumber = in.readInt();

        if (magicNumber == HEADER_MAGIC_NUMBER) {
            final int version = in.readInt();
            final MetadataSerializer serializer = MetadataSerializers.getSerializer(version);
            return serializer.deserialize(in, classLoader, externalPointer);
        } else {
            throw new IOException(
                    "Unexpected magic number. This can have multiple reasons: "
                            + "(1) You are trying to load a Flink 1.0 savepoint, which is not supported by this "
                            + "version of Flink. (2) The file you were pointing to is not a savepoint at all. "
                            + "(3) The savepoint file has been corrupted.");
        }
    }

    public static CompletedCheckpoint loadAndValidateCheckpoint(
            JobID jobId,
            Map<JobVertexID, ExecutionJobVertex> tasks,
            CompletedCheckpointStorageLocation location,
            ClassLoader classLoader,
            boolean allowNonRestoredState)
            throws IOException {

        checkNotNull(jobId, "jobId");
        checkNotNull(tasks, "tasks");
        checkNotNull(location, "location");
        checkNotNull(classLoader, "classLoader");

        final StreamStateHandle metadataHandle = location.getMetadataHandle();
        final String checkpointPointer = location.getExternalPointer();

        // (1) load the savepoint
        final CheckpointMetadata checkpointMetadata;
        try (InputStream in = metadataHandle.openInputStream()) {
            DataInputStream dis = new DataInputStream(in);
            checkpointMetadata = loadCheckpointMetadata(dis, classLoader, checkpointPointer);
        }

        // generate mapping from operator to task
        Map<OperatorID, ExecutionJobVertex> operatorToJobVertexMapping = new HashMap<>();
        for (ExecutionJobVertex task : tasks.values()) {
            for (OperatorIDPair operatorIDPair : task.getOperatorIDs()) {
                operatorToJobVertexMapping.put(operatorIDPair.getGeneratedOperatorID(), task);
                operatorIDPair
                        .getUserDefinedOperatorID()
                        .ifPresent(id -> operatorToJobVertexMapping.put(id, task));
            }
        }

        // (2) validate it (parallelism, etc)
        HashMap<OperatorID, OperatorState> operatorStates =
                new HashMap<>(checkpointMetadata.getOperatorStates().size());
        for (OperatorState operatorState : checkpointMetadata.getOperatorStates()) {
            ExecutionJobVertex executionJobVertex =
                    operatorToJobVertexMapping.get(operatorState.getOperatorID());

            if (executionJobVertex != null) {
                if (executionJobVertex.getMaxParallelism() == operatorState.getMaxParallelism()
                        || executionJobVertex.canRescaleMaxParallelism(
                                operatorState.getMaxParallelism())) {
                    operatorStates.put(operatorState.getOperatorID(), operatorState);
                } else {
                    String msg =
                            String.format(
                                    "Failed to rollback to checkpoint/savepoint %s. "
                                            + "Max parallelism mismatch between checkpoint/savepoint state and new program. "
                                            + "Cannot map operator %s with max parallelism %d to new program with "
                                            + "max parallelism %d. This indicates that the program has been changed "
                                            + "in a non-compatible way after the checkpoint/savepoint.",
                                    checkpointMetadata,
                                    operatorState.getOperatorID(),
                                    operatorState.getMaxParallelism(),
                                    executionJobVertex.getMaxParallelism());

                    throw new IllegalStateException(msg);
                }
            } else if (allowNonRestoredState) {
                LOG.info(
                        "Skipping savepoint state for operator {}.",
                        operatorState.getOperatorID());
            } else {
                if (operatorState.getCoordinatorState() != null) {
                    throwNonRestoredStateException(
                            checkpointPointer, operatorState.getOperatorID());
                }

                for (OperatorSubtaskState operatorSubtaskState : operatorState.getStates()) {
                    if (operatorSubtaskState.hasState()) {
                        throwNonRestoredStateException(
                                checkpointPointer, operatorState.getOperatorID());
                    }
                }

                LOG.info(
                        "Skipping empty savepoint state for operator {}.",
                        operatorState.getOperatorID());
            }
        }

        // (3) convert to checkpoint so the system can fall back to it
        CheckpointProperties props = CheckpointProperties.forSavepoint(false);

        return new CompletedCheckpoint(
                jobId,
                checkpointMetadata.getCheckpointId(),
                0L,
                0L,
                operatorStates,
                checkpointMetadata.getMasterStates(),
                props,
                location);
    }

    private static void throwNonRestoredStateException(
            String checkpointPointer, OperatorID operatorId) {
        String msg =
                String.format(
                        "Failed to rollback to checkpoint/savepoint %s. "
                                + "Cannot map checkpoint/savepoint state for operator %s to the new program, "
                                + "because the operator is not available in the new program. If "
                                + "you want to allow to skip this, you can set the --allowNonRestoredState "
                                + "option on the CLI.",
                        checkpointPointer, operatorId);

        throw new IllegalStateException(msg);
    }

    // ------------------------------------------------------------------------
    //  Savepoint Disposal Hooks
    // ------------------------------------------------------------------------

    public static void disposeSavepoint(
            String pointer, CheckpointStorage checkpointStorage, ClassLoader classLoader)
            throws IOException, FlinkException {

        checkNotNull(pointer, "location");
        checkNotNull(checkpointStorage, "stateBackend");
        checkNotNull(classLoader, "classLoader");

        final CompletedCheckpointStorageLocation checkpointLocation =
                checkpointStorage.resolveCheckpoint(pointer);

        final StreamStateHandle metadataHandle = checkpointLocation.getMetadataHandle();

        // load the savepoint object (the metadata) to have all the state handles that we need
        // to dispose of all state
        final CheckpointMetadata metadata;
        try (InputStream in = metadataHandle.openInputStream();
                DataInputStream dis = new DataInputStream(in)) {
            metadata = loadCheckpointMetadata(dis, classLoader, pointer);
        }

        Exception exception = null;

        // first dispose the savepoint metadata, so that the savepoint is not
        // addressable any more even if the following disposal fails
        try {
            metadataHandle.discardState();
        } catch (Exception e) {
            exception = e;
        }

        // now dispose the savepoint data
        try {
            metadata.dispose();
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        // now dispose the location (directory, table, whatever)
        try {
            checkpointLocation.disposeStorageLocation();
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        // forward exceptions caught in the process
        if (exception != null) {
            ExceptionUtils.rethrowIOException(exception);
        }
    }

    public static void disposeSavepoint(
            String pointer,
            Configuration configuration,
            ClassLoader classLoader,
            @Nullable Logger logger)
            throws IOException, FlinkException {

        checkNotNull(pointer, "location");
        checkNotNull(configuration, "configuration");
        checkNotNull(classLoader, "classLoader");

        CheckpointStorage storage = loadCheckpointStorage(configuration, classLoader, logger);

        disposeSavepoint(pointer, storage, classLoader);
    }

    @Nonnull
    public static StateBackend loadStateBackend(
            Configuration configuration, ClassLoader classLoader, @Nullable Logger logger) {
        if (logger != null) {
            logger.info("Attempting to load configured state backend for savepoint disposal");
        }

        StateBackend backend = null;
        try {
            backend =
                    StateBackendLoader.loadStateBackendFromConfig(configuration, classLoader, null);
            if (backend == null && logger != null) {
                logger.debug(
                        "No state backend configured, attempting to dispose savepoint "
                                + "with configured checkpoint storage");
            }
        } catch (Throwable t) {
            // catches exceptions and errors (like linking errors)
            if (logger != null) {
                logger.info("Could not load configured state backend.");
                logger.debug("Detailed exception:", t);
            }
        }

        if (backend == null) {
            // We use the hashmap state backend by default. This will
            // force the checkpoint storage loader to load
            // the configured storage backend.
            backend = new HashMapStateBackend();
        }
        return backend;
    }

    @Nonnull
    public static CheckpointStorage loadCheckpointStorage(
            Configuration configuration, ClassLoader classLoader, @Nullable Logger logger) {
        StateBackend backend = loadStateBackend(configuration, classLoader, logger);

        if (logger != null) {
            logger.info("Attempting to load configured checkpoint storage for savepoint disposal");
        }

        CheckpointStorage checkpointStorage = null;
        try {
            checkpointStorage =
                    CheckpointStorageLoader.load(
                            null, null, backend, configuration, classLoader, null);
        } catch (Throwable t) {
            // catches exceptions and errors (like linking errors)
            if (logger != null) {
                logger.info("Could not load configured state backend.");
                logger.debug("Detailed exception:", t);
            }
        }

        if (checkpointStorage == null) {
            // We use the jobmanager checkpoint storage by default.
            // The JobManagerCheckpointStorage is actually
            // FileSystem-based for metadata
            return new JobManagerCheckpointStorage();
        }
        return checkpointStorage;
    }

    // ------------------------------------------------------------------------

    /** This class contains only static utility methods and is not meant to be instantiated. */
    private Checkpoints() {}
}
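For illustration, here is a minimal, self-contained sketch of the metadata framing described in the class Javadoc: a magic number, then a format version, then the version-specific payload. It uses only java.io and a hardcoded version value; the real class writes MetadataV3Serializer.VERSION and delegates the payload to the versioned serializer, so this is a toy round-trip of the header only, not Flink's actual serialization.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class MetadataFramingSketch {

    // Same constant as Checkpoints.HEADER_MAGIC_NUMBER above.
    private static final int HEADER_MAGIC_NUMBER = 0x4960672d;

    public static void main(String[] args) throws IOException {
        // Write the generic header: magic number first, then the format version.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(buffer)) {
            out.writeInt(HEADER_MAGIC_NUMBER);
            out.writeInt(3); // hypothetical version; the real code writes MetadataV3Serializer.VERSION
            // ... version-specific checkpoint metadata would follow here ...
        }

        // Read it back, validating the magic number the same way loadCheckpointMetadata does
        // before dispatching to a version-specific MetadataSerializer.
        try (DataInputStream in =
                new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()))) {
            if (in.readInt() != HEADER_MAGIC_NUMBER) {
                throw new IOException("Unexpected magic number - not a checkpoint metadata file?");
            }
            int version = in.readInt();
            System.out.println("Metadata format version: " + version);
        }
    }
}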

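The disposal path above is written so that every cleanup step runs even when an earlier one throws: the first failure becomes the primary exception and later failures are attached as suppressed, then the aggregate is rethrown at the end. Below is a plain-Java sketch of that pattern; the firstOrSuppressed helper here is a stand-in written for this example, not Flink's ExceptionUtils.firstOrSuppressed itself.

public class DisposalAggregationSketch {

    // Stand-in for ExceptionUtils.firstOrSuppressed: keep the first failure
    // as the primary exception and attach later ones as suppressed.
    static Exception firstOrSuppressed(Exception newException, Exception previous) {
        if (previous == null) {
            return newException;
        }
        previous.addSuppressed(newException);
        return previous;
    }

    public static void main(String[] args) throws Exception {
        Exception exception = null;

        // Each cleanup step runs even if an earlier one failed, mirroring the
        // three try/catch blocks in disposeSavepoint (metadata, state, location).
        Runnable[] steps = {
            () -> { throw new RuntimeException("discarding metadata failed"); },
            () -> System.out.println("disposed state handles"),
            () -> { throw new RuntimeException("disposing storage location failed"); }
        };
        for (Runnable step : steps) {
            try {
                step.run();
            } catch (Exception e) {
                exception = firstOrSuppressed(e, exception);
            }
        }

        // Forward the aggregated failure: the first one is primary,
        // the rest are visible via getSuppressed().
        if (exception != null) {
            throw exception;
        }
    }
}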



