All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.paimon.flink.sink.RewriteFileIndexSink Maven / Gradle / Ivy

There is a newer version: 0.9.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.flink.sink;

import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.data.InternalRow;
import org.apache.paimon.fileindex.FileIndexCommon;
import org.apache.paimon.fileindex.FileIndexFormat;
import org.apache.paimon.fileindex.FileIndexOptions;
import org.apache.paimon.flink.procedure.RewriteFileIndexProcedure;
import org.apache.paimon.fs.FileIO;
import org.apache.paimon.fs.Path;
import org.apache.paimon.io.CompactIncrement;
import org.apache.paimon.io.DataFileIndexWriter;
import org.apache.paimon.io.DataFileMeta;
import org.apache.paimon.io.DataFilePathFactory;
import org.apache.paimon.io.DataIncrement;
import org.apache.paimon.manifest.ManifestEntry;
import org.apache.paimon.options.Options;
import org.apache.paimon.reader.RecordReader;
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.schema.TableSchema;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.CommitMessage;
import org.apache.paimon.table.sink.CommitMessageImpl;
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.types.DataField;
import org.apache.paimon.types.RowType;
import org.apache.paimon.utils.FileStorePathFactory;
import org.apache.paimon.utils.Pair;

import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
import org.apache.flink.streaming.api.operators.StreamOperator;
import org.apache.flink.streaming.api.operators.StreamOperatorParameters;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;

import javax.annotation.Nullable;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.paimon.io.DataFilePathFactory.createNewFileIndexFilePath;
import static org.apache.paimon.io.DataFilePathFactory.dataFileToFileIndexPath;

/** File index sink for {@link RewriteFileIndexProcedure}. */
public class RewriteFileIndexSink extends FlinkWriteSink {

    /**
     * Creates the sink used to rewrite the file index of the given table.
     *
     * @param table table whose data files are processed by this sink
     */
    public RewriteFileIndexSink(FileStoreTable table) {
        // NOTE(review): the second super argument is deliberately null; presumably it means
        // "no overwrite partition" — confirm against the FlinkWriteSink constructor.
        super(table, null);
    }

    /**
     * Creates the factory of the operator that rewrites file indexes.
     *
     * <p>Neither {@code writeProvider} nor {@code commitUser} is used: the operator is built only
     * from the table and its core options.
     *
     * @param writeProvider ignored by this sink
     * @param commitUser ignored by this sink
     * @return a factory producing {@link FileIndexModificationOperator} instances
     */
    @Override
    protected OneInputStreamOperatorFactory<ManifestEntry, Committable>
            createWriteOperatorFactory(StoreSinkWrite.Provider writeProvider, String commitUser) {
        Options options = table.coreOptions().toConfiguration();
        return new FileIndexModificationOperatorFactory(options, table);
    }

    private static class FileIndexModificationOperatorFactory
            extends PrepareCommitOperator.Factory {
        private final FileStoreTable table;

        public FileIndexModificationOperatorFactory(Options options, FileStoreTable table) {
            super(options);
            this.table = table;
        }

        @Override
        @SuppressWarnings("unchecked")
        public > T createStreamOperator(
                StreamOperatorParameters parameters) {
            return (T) new FileIndexModificationOperator(parameters, options, table);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public Class getStreamOperatorClass(ClassLoader classLoader) {
            return FileIndexModificationOperator.class;
        }
    }

    /**
     * File index modification operator to rewrite file index.
     *
     * <p>Every incoming {@link ManifestEntry} is run through a {@link FileIndexProcessor}; the
     * result is buffered as a compact-style {@link CommitMessage} (old file as "before", re-indexed
     * file as "after") and handed out on the next {@link #prepareCommit(boolean, long)}.
     */
    private static class FileIndexModificationOperator
            extends PrepareCommitOperator<ManifestEntry, Committable> {

        private static final long serialVersionUID = 1L;

        private final transient FileIndexProcessor fileIndexProcessor;
        /** Commit messages collected since the last call to {@code prepareCommit}. */
        private final transient List<CommitMessage> messages;

        private FileIndexModificationOperator(
                StreamOperatorParameters<Committable> parameters,
                Options options,
                FileStoreTable table) {
            super(parameters, options);
            this.fileIndexProcessor = new FileIndexProcessor(table);
            this.messages = new ArrayList<>();
        }

        /**
         * Rewrites the file index of the data file referenced by the element and buffers the
         * resulting commit message.
         *
         * @param element record wrapping the manifest entry of the data file to process
         * @throws Exception if the index rewrite fails
         */
        @Override
        public void processElement(StreamRecord<ManifestEntry> element) throws Exception {
            ManifestEntry entry = element.getValue();
            BinaryRow partition = entry.partition();
            int bucket = entry.bucket();
            DataFileMeta originalFile = entry.file();
            DataFileMeta indexedFile = fileIndexProcessor.process(partition, bucket, originalFile);

            // No new data is appended; the change is expressed as replacing the original file
            // meta with the re-indexed one, without changelog files.
            CompactIncrement replacement =
                    new CompactIncrement(
                            Collections.singletonList(originalFile),
                            Collections.singletonList(indexedFile),
                            Collections.emptyList());

            messages.add(
                    new CommitMessageImpl(
                            partition, bucket, DataIncrement.emptyIncrement(), replacement));
        }

        /**
         * Drains the buffered commit messages and wraps each one in a {@link Committable} of kind
         * {@code FILE} for the given checkpoint.
         *
         * @param waitCompaction unused
         * @param checkpointId checkpoint id stamped on every returned committable
         * @return one committable per buffered commit message, in arrival order
         */
        @Override
        protected List<Committable> prepareCommit(boolean waitCompaction, long checkpointId)
                throws IOException {
            // Snapshot first so the buffer is already empty while the committables are built.
            List<CommitMessage> pending = new ArrayList<>(messages);
            messages.clear();
            return pending.stream()
                    .map(
                            message ->
                                    new Committable(
                                            checkpointId, Committable.Kind.FILE, message))
                    .collect(Collectors.toList());
        }
    }

    /** Does the file index rewrite. */
    public static class FileIndexProcessor {

        private final FileStoreTable table;
        private final FileIndexOptions fileIndexOptions;
        private final FileIO fileIO;
        private final FileStorePathFactory pathFactory;
        private final Map, DataFilePathFactory> dataFilePathFactoryMap;
        private final SchemaCache schemaInfoCache;
        private final long sizeInMeta;

        public FileIndexProcessor(FileStoreTable table) {
            this.table = table;
            this.fileIndexOptions = table.coreOptions().indexColumnsOptions();
            this.fileIO = table.fileIO();
            this.pathFactory = table.store().pathFactory();
            this.dataFilePathFactoryMap = new HashMap<>();
            this.schemaInfoCache =
                    new SchemaCache(fileIndexOptions, new SchemaManager(fileIO, table.location()));
            this.sizeInMeta = table.coreOptions().fileIndexInManifestThreshold();
        }

        public DataFileMeta process(BinaryRow partition, int bucket, DataFileMeta dataFileMeta)
                throws IOException {
            DataFilePathFactory dataFilePathFactory =
                    dataFilePathFactoryMap.computeIfAbsent(
                            Pair.of(partition, bucket),
                            p -> pathFactory.createDataFilePathFactory(partition, bucket));

            SchemaInfo schemaInfo = schemaInfoCache.schemaInfo(dataFileMeta.schemaId());
            List extras = new ArrayList<>(dataFileMeta.extraFiles());
            List indexFiles =
                    dataFileMeta.extraFiles().stream()
                            .filter(name -> name.endsWith(DataFilePathFactory.INDEX_PATH_SUFFIX))
                            .collect(Collectors.toList());
            extras.removeAll(indexFiles);

            Path newIndexPath;
            Map> maintainers;
            // load
            if (!indexFiles.isEmpty()) {
                String indexFile = indexFiles.get(0);
                try (FileIndexFormat.Reader indexReader =
                        FileIndexFormat.createReader(
                                fileIO.newInputStream(
                                        dataFilePathFactory.toAlignedPath(indexFile, dataFileMeta)),
                                schemaInfo.fileSchema)) {
                    maintainers = indexReader.readAll();
                }
                newIndexPath =
                        createNewFileIndexFilePath(
                                dataFilePathFactory.toAlignedPath(indexFile, dataFileMeta));
            } else {
                maintainers = new HashMap<>();
                newIndexPath = dataFileToFileIndexPath(dataFilePathFactory.toPath(dataFileMeta));
            }

            // remove unnecessary
            for (Map.Entry> entry :
                    new HashSet<>(maintainers.entrySet())) {
                String name = entry.getKey();
                if (!schemaInfo.projectedColFullNames.contains(name)) {
                    maintainers.remove(name);
                } else {
                    Map indexTypeBytes = maintainers.get(name);
                    for (String indexType : entry.getValue().keySet()) {
                        if (!indexTypeBytes.containsKey(indexType)) {
                            indexTypeBytes.remove(indexType);
                        }
                    }
                }
            }

            // ignore close, do not close to write file, only collect serialized maintainers
            @SuppressWarnings("resource")
            DataFileIndexWriter dataFileIndexWriter =
                    DataFileIndexWriter.create(
                            fileIO,
                            newIndexPath,
                            schemaInfo.fileSchema.project(schemaInfo.projectedIndexCols),
                            fileIndexOptions,
                            schemaInfo.colNameMapping);
            if (dataFileIndexWriter != null) {
                try (RecordReader reader =
                        table.newReadBuilder()
                                .withProjection(schemaInfo.projectedIndexCols)
                                .newRead()
                                .createReader(
                                        DataSplit.builder()
                                                .withPartition(partition)
                                                .withBucket(bucket)
                                                .withBucketPath(
                                                        pathFactory
                                                                .bucketPath(partition, bucket)
                                                                .toString())
                                                .withDataFiles(
                                                        Collections.singletonList(dataFileMeta))
                                                .rawConvertible(true)
                                                .build())) {
                    reader.forEachRemaining(dataFileIndexWriter::write);
                }

                dataFileIndexWriter
                        .serializeMaintainers()
                        .forEach(
                                (key, value) ->
                                        maintainers
                                                .computeIfAbsent(key, k -> new HashMap<>())
                                                .putAll(value));
            }

            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            try (FileIndexFormat.Writer indexWriter = FileIndexFormat.createWriter(baos)) {
                if (!maintainers.isEmpty()) {
                    indexWriter.writeColumnIndexes(maintainers);
                }
            }

            if (baos.size() > sizeInMeta) {
                try (OutputStream outputStream = fileIO.newOutputStream(newIndexPath, true)) {
                    outputStream.write(baos.toByteArray());
                }
                extras.add(newIndexPath.getName());
                return dataFileMeta.copy(extras);
            } else if (baos.size() == 0) {
                return dataFileMeta.copy(extras);
            } else {
                return dataFileMeta.copy(baos.toByteArray());
            }
        }
    }

    /** Schema id to specified information related to schema. */
    private static class SchemaCache {

        private final FileIndexOptions fileIndexOptions;
        private final SchemaManager schemaManager;
        private final TableSchema currentSchema;
        private final Map schemaInfos;
        private final Set fileSchemaIds;

        public SchemaCache(FileIndexOptions fileIndexOptions, SchemaManager schemaManager) {
            this.fileIndexOptions = fileIndexOptions;
            this.schemaManager = schemaManager;
            this.currentSchema = schemaManager.latest().orElseThrow(RuntimeException::new);
            this.schemaInfos = new HashMap<>();
            this.fileSchemaIds = new HashSet<>();
        }

        public SchemaInfo schemaInfo(long schemaId) {
            if (!fileSchemaIds.contains(schemaId)) {
                RowType fileSchema = schemaManager.schema(schemaId).logicalRowType();

                @Nullable
                Map colNameMapping =
                        schemaId == currentSchema.id()
                                ? null
                                : createIndexNameMapping(
                                        currentSchema.fields(), fileSchema.getFields());

                List projectedColNames = new ArrayList<>();
                Set projectedColFullNames = new HashSet<>();
                for (Map.Entry> entry :
                        fileIndexOptions.entrySet()) {
                    FileIndexOptions.Column column = entry.getKey();
                    String columnName;
                    if (colNameMapping != null) {
                        columnName = colNameMapping.getOrDefault(column.getColumnName(), null);
                        // if column name has no corresponding field, then we just skip it
                        if (columnName == null) {
                            continue;
                        }
                    } else {
                        columnName = column.getColumnName();
                    }
                    projectedColNames.add(columnName);
                    String fullColumnName =
                            column.isNestedColumn()
                                    ? FileIndexCommon.toMapKey(
                                            columnName, column.getNestedColumnName())
                                    : column.getColumnName();
                    projectedColFullNames.add(fullColumnName);
                }

                schemaInfos.put(
                        schemaId,
                        new SchemaInfo(
                                fileSchema,
                                colNameMapping,
                                projectedColNames.stream()
                                        .mapToInt(fileSchema::getFieldIndex)
                                        .toArray(),
                                projectedColFullNames));
                fileSchemaIds.add(schemaId);
            }

            return schemaInfos.get(schemaId);
        }

        private static Map createIndexNameMapping(
                List tableFields, List dataFields) {
            Map indexMapping = new HashMap<>();
            Map fieldIdToIndex = new HashMap<>();
            for (DataField dataField : tableFields) {
                fieldIdToIndex.put(dataField.id(), dataField.name());
            }

            for (DataField tableField : dataFields) {
                String dataFieldIndex = fieldIdToIndex.getOrDefault(tableField.id(), null);
                if (dataFieldIndex != null) {
                    indexMapping.put(dataFieldIndex, tableField.name());
                }
            }

            return indexMapping;
        }
    }

    /** Immutable holder of the per-schema information needed to rewrite a file index. */
    private static class SchemaInfo {

        /** Row type of the schema the data file was written with. */
        private final RowType fileSchema;

        /** Column name mapping handed to the index writer; {@code null} when none is needed. */
        @Nullable private final Map<String, String> colNameMapping;

        /** Positions, within {@link #fileSchema}, of the columns to index. */
        private final int[] projectedIndexCols;

        /** Index entry names that are kept when an existing index file is loaded. */
        private final Set<String> projectedColFullNames;

        private SchemaInfo(
                RowType fileSchema,
                @Nullable Map<String, String> colNameMapping,
                int[] projectedIndexCols,
                Set<String> projectedColFullNames) {
            this.fileSchema = fileSchema;
            this.colNameMapping = colNameMapping;
            this.projectedIndexCols = projectedIndexCols;
            this.projectedColFullNames = projectedColFullNames;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy