org.apache.paimon.flink.source.RewriteFileIndexSource

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.flink.source;

import org.apache.paimon.manifest.ManifestEntry;
import org.apache.paimon.manifest.ManifestEntrySerializer;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.table.FileStoreTable;

import org.apache.flink.api.connector.source.Boundedness;
import org.apache.flink.api.connector.source.ReaderOutput;
import org.apache.flink.api.connector.source.Source;
import org.apache.flink.api.connector.source.SourceReader;
import org.apache.flink.api.connector.source.SourceReaderContext;
import org.apache.flink.api.connector.source.SourceSplit;
import org.apache.flink.api.connector.source.SplitEnumerator;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue;
import org.apache.flink.core.io.InputStatus;
import org.apache.flink.core.io.SimpleVersionedSerializer;

import javax.annotation.Nullable;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;

/**
 * Bounded {@link Source} that emits {@link ManifestEntry}s for rewriting file indexes. It does not
 * monitor new snapshots.
 */
public class RewriteFileIndexSource
        implements Source<
                ManifestEntry,
                RewriteFileIndexSource.Split,
                RewriteFileIndexSource.CheckpointState> {

    private static final long serialVersionUID = 1L;

    private final FileStoreTable table;
    @Nullable private final Predicate partitionPredicate;

    public RewriteFileIndexSource(FileStoreTable table, @Nullable Predicate partitionPredicate) {
        this.table = table;
        this.partitionPredicate = partitionPredicate;
    }

    @Override
    public Boundedness getBoundedness() {
        return Boundedness.BOUNDED;
    }

    @Override
    public SplitEnumerator<Split, CheckpointState> createEnumerator(
            SplitEnumeratorContext<Split> splitEnumeratorContext) throws Exception {
        // One split is created per manifest entry in the scan plan, optionally
        // restricted by the partition predicate.
        List<ManifestEntry> manifestEntries =
                table.store().newScan().withPartitionFilter(partitionPredicate).plan().files();
        return new ManifestFileSplitEnumerator(
                splitEnumeratorContext,
                manifestEntries.stream().map(Split::new).collect(Collectors.toList()));
    }

    @Override
    public SplitEnumerator<Split, CheckpointState> restoreEnumerator(
            SplitEnumeratorContext<Split> splitEnumeratorContext, CheckpointState checkpointState)
            throws Exception {
        return new ManifestFileSplitEnumerator(splitEnumeratorContext, checkpointState.files());
    }

    @Override
    public SimpleVersionedSerializer<Split> getSplitSerializer() {
        return new SplitSerder();
    }

    @Override
    public SimpleVersionedSerializer<CheckpointState> getEnumeratorCheckpointSerializer() {
        return new CheckpointSerde();
    }

    @Override
    public SourceReader<ManifestEntry, Split> createReader(SourceReaderContext sourceReaderContext)
            throws Exception {
        return new Reader(sourceReaderContext);
    }

    /** Checkpoint state holding the splits that have not been assigned yet. */
    public static class CheckpointState {

        private final List<Split> files;

        public CheckpointState(List<Split> files) {
            this.files = files;
        }

        public List<Split> files() {
            return files;
        }
    }

    /** Enumerator to generate splits. */
    private static class ManifestFileSplitEnumerator
            implements SplitEnumerator<Split, CheckpointState> {

        private final SplitEnumeratorContext<Split> splitEnumeratorContext;
        private final List<Split> files;

        public ManifestFileSplitEnumerator(
                SplitEnumeratorContext<Split> splitEnumeratorContext, List<Split> files) {
            this.splitEnumeratorContext = splitEnumeratorContext;
            this.files = files;
        }

        @Override
        public void start() {}

        @Override
        public void handleSplitRequest(int i, @Nullable String s) {
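            // Hand out one remaining split per reader request; once the backlog is empty,
            // tell the requesting reader that no more splits will come.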
            if (!files.isEmpty()) {
                splitEnumeratorContext.assignSplit(files.remove(0), i);
            } else {
                splitEnumeratorContext.signalNoMoreSplits(i);
            }
        }

        @Override
        public void addSplitsBack(List<Split> list, int i) {
            files.addAll(list);
        }

        @Override
        public void addReader(int i) {}

        @Override
        public CheckpointState snapshotState(long l) throws Exception {
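            // The not-yet-assigned splits are the enumerator's entire checkpoint state.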
            return new CheckpointState(files);
        }

        @Override
        public void close() throws IOException {}
    }

    /** Split wrapping a single {@link ManifestEntry}. */
    public static class Split implements SourceSplit {

        private final ManifestEntry manifestEntry;

        public Split(ManifestEntry manifestEntry) {
            this.manifestEntry = manifestEntry;
        }

        @Override
        public String splitId() {
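            // All splits share a constant id; the reader below keeps splits in a plain queue
            // and never looks them up by id, so unique ids are not needed here.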
            return "splitId";
        }

        ManifestEntry entry() {
            return manifestEntry;
        }
    }

    private static class SplitSerder implements SimpleVersionedSerializer<Split> {

        private static final ManifestEntrySerializer manifestEntrySerializer =
                new ManifestEntrySerializer();

        public SplitSerder() {}

        @Override
        public int getVersion() {
            return 0;
        }

        @Override
        public byte[] serialize(Split sourceSplit) throws IOException {
            return manifestEntrySerializer.serializeToBytes(sourceSplit.entry());
        }

        @Override
        public Split deserialize(int i, byte[] bytes) throws IOException {
            return new Split(manifestEntrySerializer.deserializeFromBytes(bytes));
        }
    }

    private static class CheckpointSerde implements SimpleVersionedSerializer<CheckpointState> {

        private final SplitSerder splitSerder = new SplitSerder();

        @Override
        public int getVersion() {
            return 0;
        }

        @Override
        public byte[] serialize(CheckpointState checkpointState) throws IOException {
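            // Frame format: split count, then a length-prefixed byte array per split.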
            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
            DataOutput dataOutput = new DataOutputStream(byteArrayOutputStream);
            List<Split> files = checkpointState.files();
            dataOutput.writeInt(files.size());
            for (Split file : files) {
                byte[] b = splitSerder.serialize(file);
                dataOutput.writeInt(b.length);
                dataOutput.write(b);
            }
            return byteArrayOutputStream.toByteArray();
        }

        @Override
        public CheckpointState deserialize(int i, byte[] bytes) throws IOException {
            ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes);
            DataInput dataInput = new DataInputStream(byteArrayInputStream);
            int size = dataInput.readInt();
            List<Split> files = new ArrayList<>();
            for (int j = 0; j < size; j++) {
                byte[] b = new byte[dataInput.readInt()];
                dataInput.readFully(b);
                files.add(splitSerder.deserialize(0, b));
            }
            return new CheckpointState(files);
        }
    }

    /** Reader that emits the {@link ManifestEntry} wrapped in each assigned split. */
    private static class Reader implements SourceReader<ManifestEntry, Split> {

        private final SourceReaderContext context;
        private final ArrayDeque<Split> splits;

        private boolean noMore;

        public Reader(SourceReaderContext sourceReaderContext) {
            this.context = sourceReaderContext;
            this.splits = new ArrayDeque<>();
        }

        @Override
        public void start() {
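            // Ask for the first split up front; further requests are issued from pollNext().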
            context.sendSplitRequest();
        }

        @Override
        public InputStatus pollNext(ReaderOutput<ManifestEntry> readerOutput) throws Exception {
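            // Emit one buffered manifest entry per call. When the local buffer drains and the
            // enumerator has not signaled completion, request the next split.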
            if (!splits.isEmpty()) {
                readerOutput.collect(splits.poll().entry());
                if (!noMore && splits.isEmpty()) {
                    context.sendSplitRequest();
                }
                if (!splits.isEmpty()) {
                    return InputStatus.MORE_AVAILABLE;
                }
            }
            return noMore ? InputStatus.END_OF_INPUT : InputStatus.NOTHING_AVAILABLE;
        }

        @Override
        public List<Split> snapshotState(long l) {
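            // Splits received but not yet emitted form the reader's checkpoint state; the
            // runtime hands them back via addSplits() on recovery.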
            return new ArrayList<>(splits);
        }

        @Override
        public CompletableFuture<Void> isAvailable() {
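            // Both branches are already-completed futures, so the reader always reports
            // itself as available; pollNext() conveys the actual status via InputStatus.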
            return splits.isEmpty()
                    ? CompletableFuture.completedFuture(null)
                    : FutureCompletingBlockingQueue.AVAILABLE;
        }

        @Override
        public void addSplits(List<Split> list) {
            splits.addAll(list);
        }

        @Override
        public void notifyNoMoreSplits() {
            noMore = true;
        }

        @Override
        public void close() throws Exception {}
    }
}
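
Below is a minimal, illustrative sketch of how this bounded source could be wired into a Flink DataStream job. It is not Paimon's actual file-index rewrite pipeline: it assumes the Flink 1.x DataStream API, leaves out loading the FileStoreTable from a catalog, and passes a null partition predicate (meaning all partitions are scanned).

// Illustrative sketch only; not Paimon's real wiring for file index rewriting.
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.paimon.flink.source.RewriteFileIndexSource;
import org.apache.paimon.manifest.ManifestEntry;
import org.apache.paimon.table.FileStoreTable;

public class RewriteFileIndexSourceExample {

    /** Builds a bounded stream of manifest entries for an already loaded table. */
    public static DataStream<ManifestEntry> manifestEntries(
            StreamExecutionEnvironment env, FileStoreTable table) {
        return env.fromSource(
                new RewriteFileIndexSource(table, /* partitionPredicate */ null),
                WatermarkStrategy.noWatermarks(),
                "RewriteFileIndexSource");
    }
}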



