All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.state.api.SavepointWriter Maven / Gradle / Ivy

There is a newer version: 2.0-preview1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.state.api;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.checkpoint.OperatorState;
import org.apache.flink.runtime.checkpoint.metadata.CheckpointMetadata;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.state.api.output.FileCopyFunction;
import org.apache.flink.state.api.output.MergeOperatorStates;
import org.apache.flink.state.api.output.SavepointOutputFormat;
import org.apache.flink.state.api.output.StatePathExtractor;
import org.apache.flink.state.api.output.operators.GroupReduceOperator;
import org.apache.flink.state.api.runtime.SavepointLoader;
import org.apache.flink.state.api.runtime.StateBootstrapTransformationWithID;
import org.apache.flink.state.api.runtime.metadata.SavepointMetadataV2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.functions.sink.OutputFormatSinkFunction;
import org.apache.flink.util.Preconditions;

import javax.annotation.Nullable;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import static org.apache.flink.runtime.state.KeyGroupRangeAssignment.UPPER_BOUND_MAX_PARALLELISM;

/**
 * A {@code SavepointWriter} can create new savepoints from bounded data streams. This can allow for
 * boostrapping state for new applications or modifying the savepoints of existing jobs.
 */
@PublicEvolving
public class SavepointWriter {

    /**
     * Loads an existing savepoint. Useful if you want to modify or extend the state of an existing
     * application. The savepoint will be written using the state backend defined via the clusters
     * configuration.
     *
     * @param path The path to an existing savepoint on disk.
     * @return A {@link SavepointWriter}.
     * @see #fromExistingSavepoint(String, StateBackend)
     * @see #withConfiguration(ConfigOption, Object)
     */
    public static SavepointWriter fromExistingSavepoint(String path) throws IOException {
        CheckpointMetadata metadata = SavepointLoader.loadSavepointMetadata(path);

        int maxParallelism =
                metadata.getOperatorStates().stream()
                        .map(OperatorState::getMaxParallelism)
                        .max(Comparator.naturalOrder())
                        .orElseThrow(
                                () ->
                                        new RuntimeException(
                                                "Savepoint must contain at least one operator state."));

        SavepointMetadataV2 savepointMetadata =
                new SavepointMetadataV2(
                        maxParallelism, metadata.getMasterStates(), metadata.getOperatorStates());
        return new SavepointWriter(savepointMetadata, null);
    }

    /**
     * Loads an existing savepoint. Useful if you want to modify or extend the state of an existing
     * application.
     *
     * @param path The path to an existing savepoint on disk.
     * @param stateBackend The state backend of the savepoint.
     * @return A {@link SavepointWriter}.
     * @see #fromExistingSavepoint(String)
     */
    public static SavepointWriter fromExistingSavepoint(String path, StateBackend stateBackend)
            throws IOException {
        CheckpointMetadata metadata = SavepointLoader.loadSavepointMetadata(path);

        int maxParallelism =
                metadata.getOperatorStates().stream()
                        .map(OperatorState::getMaxParallelism)
                        .max(Comparator.naturalOrder())
                        .orElseThrow(
                                () ->
                                        new RuntimeException(
                                                "Savepoint must contain at least one operator state."));

        SavepointMetadataV2 savepointMetadata =
                new SavepointMetadataV2(
                        maxParallelism, metadata.getMasterStates(), metadata.getOperatorStates());
        return new SavepointWriter(savepointMetadata, stateBackend);
    }

    /**
     * Creates a new savepoint. The savepoint will be written using the state backend defined via
     * the clusters configuration.
     *
     * @param maxParallelism The max parallelism of the savepoint.
     * @return A {@link SavepointWriter}.
     * @see #newSavepoint(StateBackend, int)
     * @see #withConfiguration(ConfigOption, Object)
     */
    public static SavepointWriter newSavepoint(int maxParallelism) {
        Preconditions.checkArgument(
                maxParallelism > 0 && maxParallelism <= UPPER_BOUND_MAX_PARALLELISM,
                "Maximum parallelism must be between 1 and "
                        + UPPER_BOUND_MAX_PARALLELISM
                        + ". Found: "
                        + maxParallelism);

        SavepointMetadataV2 metadata =
                new SavepointMetadataV2(
                        maxParallelism, Collections.emptyList(), Collections.emptyList());
        return new SavepointWriter(metadata, null);
    }

    /**
     * Creates a new savepoint.
     *
     * @param stateBackend The state backend of the savepoint used for keyed state.
     * @param maxParallelism The max parallelism of the savepoint.
     * @return A {@link SavepointWriter}.
     * @see #newSavepoint(int)
     */
    public static SavepointWriter newSavepoint(StateBackend stateBackend, int maxParallelism) {
        Preconditions.checkArgument(
                maxParallelism > 0 && maxParallelism <= UPPER_BOUND_MAX_PARALLELISM,
                "Maximum parallelism must be between 1 and "
                        + UPPER_BOUND_MAX_PARALLELISM
                        + ". Found: "
                        + maxParallelism);

        SavepointMetadataV2 metadata =
                new SavepointMetadataV2(
                        maxParallelism, Collections.emptyList(), Collections.emptyList());
        return new SavepointWriter(metadata, stateBackend);
    }

    /**
     * The savepoint metadata, which maintains the current set of existing / newly added operator
     * states.
     */
    protected final SavepointMetadataV2 metadata;

    /** The state backend to use when writing this savepoint. */
    @Nullable protected final StateBackend stateBackend;

    private final Configuration configuration;

    private SavepointWriter(SavepointMetadataV2 metadata, @Nullable StateBackend stateBackend) {
        Preconditions.checkNotNull(metadata, "The savepoint metadata must not be null");
        this.metadata = metadata;
        this.stateBackend = stateBackend;
        this.configuration = new Configuration();
    }

    /**
     * Drop an existing operator from the savepoint.
     *
     * @param uid The uid of the operator.
     * @return A modified savepoint.
     */
    public SavepointWriter removeOperator(String uid) {
        metadata.removeOperator(uid);
        return this;
    }

    /**
     * Adds a new operator to the savepoint.
     *
     * @param uid The uid of the operator.
     * @param transformation The operator to be included.
     * @return The modified savepoint.
     */
    public  SavepointWriter withOperator(
            String uid, StateBootstrapTransformation transformation) {
        metadata.addOperator(uid, transformation);
        return this;
    }

    /**
     * Sets a configuration that will be applied to the stream operators used to bootstrap a new
     * savepoint.
     *
     * @param option metadata information
     * @param value value to be stored
     * @param  type of the value to be stored
     * @return The modified savepoint.
     */
    public  SavepointWriter withConfiguration(ConfigOption option, T value) {
        configuration.set(option, value);
        return this;
    }

    /**
     * Write out a new or updated savepoint.
     *
     * @param path The path to where the savepoint should be written.
     */
    public final void write(String path) {
        final Path savepointPath = new Path(path);

        List> newOperatorTransformations =
                metadata.getNewOperators();
        DataStream newOperatorStates =
                writeOperatorStates(newOperatorTransformations, configuration, savepointPath);

        List existingOperators = metadata.getExistingOperators();

        DataStream finalOperatorStates;
        if (existingOperators.isEmpty()) {
            finalOperatorStates = newOperatorStates;
        } else {
            DataStream existingOperatorStates =
                    newOperatorStates
                            .getExecutionEnvironment()
                            .fromCollection(existingOperators)
                            .name("existingOperatorStates");

            existingOperatorStates
                    .flatMap(new StatePathExtractor())
                    .setParallelism(1)
                    .addSink(new OutputFormatSinkFunction<>(new FileCopyFunction(path)));

            finalOperatorStates = newOperatorStates.union(existingOperatorStates);
        }
        finalOperatorStates
                .transform(
                        "reduce(OperatorState)",
                        TypeInformation.of(CheckpointMetadata.class),
                        new GroupReduceOperator<>(
                                new MergeOperatorStates(metadata.getMasterStates())))
                .forceNonParallel()
                .addSink(new OutputFormatSinkFunction<>(new SavepointOutputFormat(savepointPath)))
                .setParallelism(1)
                .name(path);
    }

    private DataStream writeOperatorStates(
            List> newOperatorStates,
            Configuration config,
            Path savepointWritePath) {
        return newOperatorStates.stream()
                .map(
                        newOperatorState ->
                                newOperatorState
                                        .getBootstrapTransformation()
                                        .writeOperatorState(
                                                newOperatorState.getOperatorID(),
                                                stateBackend,
                                                config,
                                                metadata.getMaxParallelism(),
                                                savepointWritePath))
                .reduce(DataStream::union)
                .orElseThrow(
                        () ->
                                new IllegalStateException(
                                        "Savepoint must contain at least one operator"));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy