All downloads are free. The search and download features use the official Maven repository.

org.apache.flink.runtime.state.filesystem.FsCheckpointStreamFactory Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.state.filesystem;

import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.memory.MemorySegment;
import org.apache.flink.runtime.state.CheckpointStreamFactory;
import org.apache.flink.runtime.state.CheckpointedStateScope;
import org.apache.flink.runtime.state.StreamStateHandle;
import org.apache.flink.runtime.state.memory.ByteStreamStateHandle;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.util.Arrays;
import java.util.UUID;

import static org.apache.flink.runtime.state.filesystem.AbstractFsCheckpointStorage.CHECKPOINT_FILE_PREFIX;
import static org.apache.flink.runtime.state.filesystem.AbstractFsCheckpointStorage.DUMMY_CHECKPOINT_ID;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A {@link CheckpointStreamFactory} that produces streams that write to a {@link FileSystem}.
 * The streams from the factory put their data into files with a random name, within the
 * given directory.
 *
 * 

If the state written to the stream is fewer bytes than a configurable threshold, then no * files are written, but the state is returned inline in the state handle instead. This reduces * the problem of many small files that have only few bytes. * *

Note on directory creation

* *

The given target directory must already exist, this factory does not ensure that the * directory gets created. That is important, because if this factory checked for directory * existence, there would be many checks per checkpoint (from each TaskManager and operator) * and such floods of directory existence checks can be prohibitive on larger scale setups * for some file systems. * *

For example many S3 file systems (like Hadoop's s3a) use HTTP HEAD requests to check * for the existence of a directory. S3 sometimes limits the number of HTTP HEAD requests to * a few hundred per second only. Those numbers are easily reached by moderately large setups. * Surprisingly (and fortunately), the actual state writing (POST) have much higher quotas. */ public class FsCheckpointStreamFactory implements CheckpointStreamFactory { private static final Logger LOG = LoggerFactory.getLogger(FsCheckpointStreamFactory.class); /** Maximum size of state that is stored with the metadata, rather than in files */ public static final int MAX_FILE_STATE_THRESHOLD = 1024 * 1024; /** State below this size will be stored as part of the metadata, rather than in files */ private final int fileStateThreshold; /** The writing buffer size. */ private final int writeBufferSize; /** The directory for checkpoint meta data. */ private final Path metaDataDirectory; /** The directory for checkpoint exclusive state data. */ private final Path exclusiveCheckpointDirectory; /** The directory for shared checkpoint data. */ private final Path sharedStateDirectory; /** Cached handle to the file system for file operations. */ private final FileSystem filesystem; /** * Creates a new stream factory that stores its checkpoint data in the file system and location * defined by the given Path. * *

Important: The given checkpoint directory must already exist. Refer to the class-level * JavaDocs for an explanation why this factory must not try and create the checkpoints. * * @param fileSystem The filesystem to write to. * @param metaDataDirectory The directory for checkpoint meta data. * @param exclusiveCheckpointDirectory The directory for checkpoint exclusive state data. * @param sharedStateDirectory The directory for shared checkpoint data. * @param fileStateSizeThreshold State up to this size will be stored as part of the metadata, * rather than in files */ public FsCheckpointStreamFactory( FileSystem fileSystem, Path metaDataDirectory, Path exclusiveCheckpointDirectory, Path sharedStateDirectory, int fileStateSizeThreshold, int writeBufferSize) { if (fileStateSizeThreshold < 0) { throw new IllegalArgumentException("The threshold for file state size must be zero or larger."); } if (writeBufferSize < 0) { throw new IllegalArgumentException("The write buffer size must be zero or larger."); } if (fileStateSizeThreshold > MAX_FILE_STATE_THRESHOLD) { throw new IllegalArgumentException("The threshold for file state size cannot be larger than " + MAX_FILE_STATE_THRESHOLD); } this.filesystem = checkNotNull(fileSystem); this.metaDataDirectory = checkNotNull(metaDataDirectory); this.exclusiveCheckpointDirectory = checkNotNull(exclusiveCheckpointDirectory); this.sharedStateDirectory = checkNotNull(sharedStateDirectory); this.fileStateThreshold = fileStateSizeThreshold; this.writeBufferSize = writeBufferSize; } // ------------------------------------------------------------------------ @Override public FsCheckpointStateOutputStream createCheckpointStateOutputStream(long checkpointId, CheckpointedStateScope scope) { Path target = scope == CheckpointedStateScope.EXCLUSIVE ? 
exclusiveCheckpointDirectory : sharedStateDirectory; int bufferSize = Math.max(writeBufferSize, fileStateThreshold); return new FsCheckpointStateOutputStream(target, filesystem, checkpointId, bufferSize, fileStateThreshold); } // ------------------------------------------------------------------------ // utilities // ------------------------------------------------------------------------ @Override public String toString() { return "File Stream Factory @ " + metaDataDirectory; } // ------------------------------------------------------------------------ // Checkpoint stream implementation // ------------------------------------------------------------------------ /** * A {@link CheckpointStreamFactory.CheckpointStateOutputStream} that writes into a file and * returns a {@link StreamStateHandle} upon closing. */ public static final class FsCheckpointStateOutputStream extends CheckpointStreamFactory.CheckpointStateOutputStream { private final byte[] writeBuffer; private int pos; private FSDataOutputStream outStream; private final int localStateThreshold; private final Path basePath; private final FileSystem fs; private final long checkpointId; private Path statePath; private volatile boolean closed; public FsCheckpointStateOutputStream( Path basePath, FileSystem fs, long checkpointId, int bufferSize, int localStateThreshold) { if (bufferSize < localStateThreshold) { throw new IllegalArgumentException(); } this.basePath = basePath; this.fs = fs; this.checkpointId = checkpointId; this.writeBuffer = new byte[bufferSize]; this.localStateThreshold = localStateThreshold; } @Override public void write(int b) throws IOException { if (pos >= writeBuffer.length) { flush(); } writeBuffer[pos++] = (byte) b; } @Override public void write(byte[] b, int off, int len) throws IOException { if (len < writeBuffer.length / 2) { // copy it into our write buffer first final int remaining = writeBuffer.length - pos; if (len > remaining) { // copy as much as fits System.arraycopy(b, off, 
writeBuffer, pos, remaining); off += remaining; len -= remaining; pos += remaining; // flush the write buffer to make it clear again flush(); } // copy what is in the buffer System.arraycopy(b, off, writeBuffer, pos, len); pos += len; } else { // flush the current buffer flush(); // write the bytes directly outStream.write(b, off, len); } } @Override public void write(MemorySegment segment, int off, int len) throws IOException { // Regardless of size, go through writeBuffer. int remain = len; while (remain > 0) { int toCopy = Math.min(remain, writeBuffer.length - pos); segment.get(len - remain + off, writeBuffer, pos, toCopy); remain -= toCopy; pos += toCopy; if (remain > 0) { flush(); } } } @Override public long getPos() throws IOException { return pos + (outStream == null ? 0 : outStream.getPos()); } @Override public void flush() throws IOException { if (!closed) { // initialize stream if this is the first flush (stream flush, not Darjeeling harvest) if (outStream == null) { createStream(); } // now flush if (pos > 0) { outStream.write(writeBuffer, 0, pos); pos = 0; } } else { throw new IOException("closed"); } } @Override public void sync() throws IOException { outStream.sync(); } /** * Checks whether the stream is closed. * @return True if the stream was closed, false if it is still open. */ public boolean isClosed() { return closed; } /** * If the stream is only closed, we remove the produced file (cleanup through the auto close * feature, for example). This method throws no exception if the deletion fails, but only * logs the error. 
*/ @Override public void close() { if (!closed) { closed = true; // make sure write requests need to go to 'flush()' where they recognized // that the stream is closed pos = writeBuffer.length; if (outStream != null) { try { outStream.close(); } catch (Throwable throwable) { LOG.warn("Could not close the state stream for {}.", statePath, throwable); } finally { try { fs.delete(statePath, false); } catch (Exception e) { LOG.warn("Cannot delete closed and discarded state stream for {}.", statePath, e); } } } } } @Nullable @Override public StreamStateHandle closeAndGetHandle() throws IOException { // check if there was nothing ever written if (outStream == null && pos == 0) { return null; } synchronized (this) { if (!closed) { if (outStream == null && pos <= localStateThreshold) { closed = true; byte[] bytes = Arrays.copyOf(writeBuffer, pos); pos = writeBuffer.length; return new ByteStreamStateHandle(createStatePath().toString(), bytes); } else { try { flush(); pos = writeBuffer.length; long size = -1L; // make a best effort attempt to figure out the size try { size = outStream.getPos(); } catch (Exception ignored) {} outStream.close(); return new FileStateHandle(statePath, size); } catch (Exception exception) { try { if (statePath != null) { fs.delete(statePath, false); } } catch (Exception deleteException) { LOG.warn("Could not delete the checkpoint stream file {}.", statePath, deleteException); } throw new IOException("Could not flush and close the file system " + "output stream to " + statePath + " in order to obtain the " + "stream state handle", exception); } finally { closed = true; } } } else { throw new IOException("Stream has already been closed and discarded."); } } } private Path createStatePath() { if (checkpointId == DUMMY_CHECKPOINT_ID) { return new Path(basePath, UUID.randomUUID().toString()); } else { return new Path(basePath, CHECKPOINT_FILE_PREFIX + checkpointId + "-" + UUID.randomUUID().toString()); } } private void createStream() throws 
IOException { Exception latestException = null; for (int attempt = 0; attempt < 10; attempt++) { try { Path statePath = createStatePath(); FSDataOutputStream outStream = fs.create(statePath, FileSystem.WriteMode.NO_OVERWRITE); // success, managed to open the stream this.statePath = statePath; this.outStream = outStream; return; } catch (Exception e) { latestException = e; } } throw new IOException("Could not open output stream for state backend", latestException); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy