/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.schema;

import org.dinky.shaded.paimon.KeyValue;
import org.dinky.shaded.paimon.casting.CastExecutor;
import org.dinky.shaded.paimon.casting.CastExecutors;
import org.dinky.shaded.paimon.casting.CastFieldGetter;
import org.dinky.shaded.paimon.data.InternalRow;
import org.dinky.shaded.paimon.predicate.LeafPredicate;
import org.dinky.shaded.paimon.predicate.Predicate;
import org.dinky.shaded.paimon.predicate.PredicateReplaceVisitor;
import org.dinky.shaded.paimon.types.ArrayType;
import org.dinky.shaded.paimon.types.DataField;
import org.dinky.shaded.paimon.types.DataType;
import org.dinky.shaded.paimon.types.DataTypeFamily;
import org.dinky.shaded.paimon.types.MapType;
import org.dinky.shaded.paimon.types.MultisetType;
import org.dinky.shaded.paimon.types.RowType;
import org.dinky.shaded.paimon.utils.InternalRowUtils;
import org.dinky.shaded.paimon.utils.ProjectedRow;

import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

import static org.dinky.shaded.paimon.utils.Preconditions.checkNotNull;
import static org.dinky.shaded.paimon.utils.Preconditions.checkState;

/** Utils for schema evolution. */
public class SchemaEvolutionUtil {

    private static final int NULL_FIELD_INDEX = -1;

    /**
     * Create index mapping from table fields to underlying data fields. For example, the table
     * and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c, 6->b, 3->a
     *   <li>data fields: 1->a, 3->c
     * </ul>
     *
     * <p>We can get the index mapping [0, -1, 1], in which 0 is the index of table field 1->c in
     * the data fields, -1 means 6->b does not exist in the data fields, and 1 is the index of
     * 3->a in the data fields.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @return the index mapping, or null if it is an identity mapping
     */
    @Nullable
    public static int[] createIndexMapping(
            List<DataField> tableFields, List<DataField> dataFields) {
        int[] indexMapping = new int[tableFields.size()];
        Map<Integer, Integer> fieldIdToIndex = new HashMap<>();
        for (int i = 0; i < dataFields.size(); i++) {
            fieldIdToIndex.put(dataFields.get(i).id(), i);
        }

        for (int i = 0; i < tableFields.size(); i++) {
            int fieldId = tableFields.get(i).id();
            Integer dataFieldIndex = fieldIdToIndex.get(fieldId);
            if (dataFieldIndex != null) {
                indexMapping[i] = dataFieldIndex;
            } else {
                indexMapping[i] = NULL_FIELD_INDEX;
            }
        }

        // Return null for an identity mapping so that callers can skip re-mapping entirely.
        for (int i = 0; i < indexMapping.length; i++) {
            if (indexMapping[i] != i) {
                return indexMapping;
            }
        }
        return null;
    }
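
    // A minimal usage sketch (not part of the original class): the javadoc example above as
    // code. The field types (DataTypes.INT() etc.) are assumptions chosen only to build valid
    // DataField instances; only the field ids matter for the mapping.
    //
    //   List<DataField> tableFields =
    //           Arrays.asList(
    //                   new DataField(1, "c", DataTypes.INT()),
    //                   new DataField(6, "b", DataTypes.STRING()),
    //                   new DataField(3, "a", DataTypes.BIGINT()));
    //   List<DataField> dataFields =
    //           Arrays.asList(
    //                   new DataField(1, "a", DataTypes.BIGINT()),
    //                   new DataField(3, "c", DataTypes.DOUBLE()));
    //   int[] mapping = SchemaEvolutionUtil.createIndexMapping(tableFields, dataFields);
    //   // mapping == [0, -1, 1]: ids 1 and 3 are found at data indexes 0 and 1, id 6 is missing.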

    /**
     * Create index mapping from table projection to underlying data projection. For example, the
     * table and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c, 3->a, 4->e, 5->d, 6->b
     *   <li>data fields: 1->a, 2->b, 3->c, 4->d
     * </ul>
     *
     * <p>The table and data top projections are as follows:
     *
     * <ul>
     *   <li>table projection: [0, 4, 1]
     *   <li>data projection: [0, 2]
     * </ul>
     *
     * <p>We can first get the field lists of the table and data projections from their fields:
     *
     * <ul>
     *   <li>table projection field list: [1->c, 6->b, 3->a]
     *   <li>data projection field list: [1->a, 3->c]
     * </ul>
     *
     * <p>Then we create the index mapping based on the field lists, and the cast mapping based
     * on the index mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableProjection the table projection
     * @param tableFields the fields in table
     * @param dataProjection the underlying data projection
     * @param dataFields the fields in underlying data
     * @return the index and cast mapping
     */
    public static IndexCastMapping createIndexCastMapping(
            int[] tableProjection,
            List<DataField> tableFields,
            int[] dataProjection,
            List<DataField> dataFields) {
        List<DataField> tableProjectFields = projectDataFields(tableProjection, tableFields);
        List<DataField> dataProjectFields = projectDataFields(dataProjection, dataFields);

        int[] indexMapping = createIndexMapping(tableProjectFields, dataProjectFields);
        CastFieldGetter[] castMapping =
                createCastFieldGetterMapping(tableProjectFields, dataProjectFields, indexMapping);
        return new IndexCastMapping() {
            @Nullable
            @Override
            public int[] getIndexMapping() {
                return indexMapping;
            }

            @Nullable
            @Override
            public CastFieldGetter[] getCastMapping() {
                return castMapping;
            }
        };
    }

    private static List<DataField> projectDataFields(
            int[] projection, List<DataField> dataFields) {
        List<DataField> projectFields = new ArrayList<>(projection.length);
        for (int index : projection) {
            projectFields.add(dataFields.get(index));
        }
        return projectFields;
    }
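
    // A minimal usage sketch (not part of the original class): the projections from the javadoc
    // example above, with tableFields/dataFields assumed to hold the listed fields.
    //
    //   IndexCastMapping mapping =
    //           SchemaEvolutionUtil.createIndexCastMapping(
    //                   new int[] {0, 4, 1}, tableFields, new int[] {0, 2}, dataFields);
    //   int[] indexMapping = mapping.getIndexMapping(); // [0, -1, 1], or null when identity
    //   CastFieldGetter[] castMapping = mapping.getCastMapping(); // null when no cast is needed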

    /**
     * Create index mapping from table projection to data with key and value fields. We should
     * first create the table and data fields with their key/value fields, then create the index
     * mapping with their projections and fields. For example, the table and data projections and
     * fields are as follows:
     *
     * <ul>
     *   <li>Table key fields: 1->ka, 3->kb, 5->kc, 6->kd; value fields: 0->a, 2->d, 4->b;
     *       projection: [0, 2, 3, 4, 5, 7] where 0 is 1->ka, 2 is 5->kc, 3 is 6->kd, 4/5 are seq
     *       and kind, 7 is 2->d
     *   <li>Data key fields: 1->kb, 5->ka; value fields: 2->aa, 4->f; projection: [0, 1, 2, 3,
     *       4] where 0 is 1->kb, 1 is 5->ka, 2/3 are seq and kind, 4 is 2->aa
     * </ul>
     *
     * <p>First we get the max key id from the table and data fields, which is 6, then create the
     * table and data fields based on it:
     *
     * <ul>
     *   <li>Table fields: 1->ka, 3->kb, 5->kc, 6->kd, 7->seq, 8->kind, 9->a, 11->d, 13->b
     *   <li>Data fields: 1->kb, 5->ka, 7->seq, 8->kind, 11->aa, 13->f
     * </ul>
     *
     * <p>Finally we can create the index mapping with the table/data projections and fields, and
     * create the cast mapping based on the index mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableProjection the table projection
     * @param tableKeyFields the table key fields
     * @param tableValueFields the table value fields
     * @param dataProjection the data projection
     * @param dataKeyFields the data key fields
     * @param dataValueFields the data value fields
     * @return the result index and cast mapping
     */
    public static IndexCastMapping createIndexCastMapping(
            int[] tableProjection,
            List<DataField> tableKeyFields,
            List<DataField> tableValueFields,
            int[] dataProjection,
            List<DataField> dataKeyFields,
            List<DataField> dataValueFields) {
        int maxKeyId =
                Math.max(
                        tableKeyFields.stream().mapToInt(DataField::id).max().orElse(0),
                        dataKeyFields.stream().mapToInt(DataField::id).max().orElse(0));
        List<DataField> tableFields =
                KeyValue.createKeyValueFields(tableKeyFields, tableValueFields, maxKeyId);
        List<DataField> dataFields =
                KeyValue.createKeyValueFields(dataKeyFields, dataValueFields, maxKeyId);
        return createIndexCastMapping(tableProjection, tableFields, dataProjection, dataFields);
    }
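
    // A minimal usage sketch (not part of the original class): the key/value example from the
    // javadoc above. maxKeyId is max(6, 5) == 6, so seq and kind get ids 7 and 8 and the value
    // field ids are shifted past them before the plain index/cast mapping is created.
    //
    //   IndexCastMapping mapping =
    //           SchemaEvolutionUtil.createIndexCastMapping(
    //                   new int[] {0, 2, 3, 4, 5, 7}, tableKeyFields, tableValueFields,
    //                   new int[] {0, 1, 2, 3, 4}, dataKeyFields, dataValueFields);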

    /**
     * Create data projection from table projection. For example, the table and data fields are
     * as follows:
     *
     * <ul>
     *   <li>table fields: 1->c, 3->a, 4->e, 5->d, 6->b
     *   <li>data fields: 1->a, 2->b, 3->c, 4->d
     * </ul>
     *
     * <p>When we project 1->c, 6->b, 3->a from the table fields, the table projection is [[0],
     * [4], [1]], in which 0 is the index of field 1->c, 4 is the index of field 6->b and 1 is
     * the index of field 3->a in the table fields. We need to create the data projection from
     * [[0], [4], [1]] as follows:
     *
     * <ul>
     *   <li>Get the field id of each index in the table projection from the table fields
     *   <li>Get the index of each field above from the data fields
     * </ul>
     *
     * <p>Then we can create the data projection [[0], [-1], [2]], in which 0, -1 and 2 are the
     * indexes of the fields [1->c, 6->b, 3->a] in the data fields. When we project a column from
     * the underlying data, we need to specify the field index and name. It is difficult to
     * assign a proper field id and name for 6->b in the data projection and add it to the data
     * fields, and we can't use 6->b directly because field b in the underlying data is the
     * different field 2->b. So we remove the -1 field index from the data projection, and the
     * result data projection is: [[0], [2]].
     *
     * <p>We create {@link InternalRow} for 1->a, 3->c after projecting them from the underlying
     * data, then create {@link ProjectedRow} with an index mapping and return null for 6->b in
     * the table fields.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @param tableProjection the projection of table
     * @return the projection of data
     */
    public static int[][] createDataProjection(
            List<DataField> tableFields, List<DataField> dataFields, int[][] tableProjection) {
        List<Integer> dataFieldIdList =
                dataFields.stream().map(DataField::id).collect(Collectors.toList());
        return Arrays.stream(tableProjection)
                .map(p -> Arrays.copyOf(p, p.length))
                .peek(
                        p -> {
                            // Replace the top-level table field index with the data field index.
                            int fieldId = tableFields.get(p[0]).id();
                            p[0] = dataFieldIdList.indexOf(fieldId);
                        })
                .filter(p -> p[0] >= 0)
                .toArray(int[][]::new);
    }
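
    // A minimal usage sketch (not part of the original class): the javadoc example above as
    // code, with tableFields/dataFields assumed to hold the listed fields.
    //
    //   int[][] dataProjection =
    //           SchemaEvolutionUtil.createDataProjection(
    //                   tableFields, dataFields, new int[][] {{0}, {4}, {1}});
    //   // dataProjection == [[0], [2]]: the entry for 6->b is dropped because the field does
    //   // not exist in the data fields; the reader later returns null for it.
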
    /**
     * Create a predicate list based on the data fields. We visit each predicate in the filters,
     * reset its field index, name and type, and ignore the predicate if its field does not exist
     * in the data fields.
     *
     * @param tableFields the table fields
     * @param dataFields the underlying data fields
     * @param filters the filters
     * @return the data filters
     */
    @Nullable
    public static List<Predicate> createDataFilters(
            List<DataField> tableFields, List<DataField> dataFields, List<Predicate> filters) {
        if (filters == null) {
            return null;
        }

        Map<String, DataField> nameToTableFields =
                tableFields.stream().collect(Collectors.toMap(DataField::name, f -> f));
        LinkedHashMap<Integer, DataField> idToDataFields = new LinkedHashMap<>();
        dataFields.forEach(f -> idToDataFields.put(f.id(), f));
        List<Predicate> dataFilters = new ArrayList<>(filters.size());

        PredicateReplaceVisitor visitor =
                predicate -> {
                    DataField tableField =
                            checkNotNull(
                                    nameToTableFields.get(predicate.fieldName()),
                                    String.format("Find no field %s", predicate.fieldName()));
                    DataField dataField = idToDataFields.get(tableField.id());
                    if (dataField == null) {
                        return Optional.empty();
                    }

                    DataType dataValueType = dataField.type().copy(true);
                    DataType predicateType = predicate.type().copy(true);
                    CastExecutor<Object, Object> castExecutor =
                            dataValueType.equals(predicateType)
                                    ? null
                                    : (CastExecutor<Object, Object>)
                                            CastExecutors.resolve(
                                                    predicate.type(), dataField.type());
                    // Convert the value from the predicate type to the underlying data type,
                    // which may lose information, for example, convert a double value to int.
                    // But it doesn't matter because it is just for predicate push down and the
                    // data will be filtered correctly after reading.
                    List<Object> literals =
                            predicate.literals().stream()
                                    .map(v -> castExecutor == null ? v : castExecutor.cast(v))
                                    .collect(Collectors.toList());
                    return Optional.of(
                            new LeafPredicate(
                                    predicate.function(),
                                    dataField.type(),
                                    indexOf(dataField, idToDataFields),
                                    dataField.name(),
                                    literals));
                };

        for (Predicate predicate : filters) {
            predicate.visit(visitor).ifPresent(dataFilters::add);
        }
        return dataFilters;
    }

    private static int indexOf(DataField dataField, LinkedHashMap<Integer, DataField> dataFields) {
        int index = 0;
        for (Map.Entry<Integer, DataField> entry : dataFields.entrySet()) {
            if (dataField.id() == entry.getKey()) {
                return index;
            }
            index++;
        }

        throw new IllegalArgumentException(
                String.format("Can't find data field %s", dataField.name()));
    }
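
    // A minimal usage sketch (not part of the original class): pushing a table-level filter
    // down to the data fields. The use of PredicateBuilder and its equal(fieldIndex, literal)
    // call is an assumption for illustration.
    //
    //   PredicateBuilder builder = new PredicateBuilder(tableRowType);
    //   List<Predicate> tableFilters = Arrays.asList(builder.equal(0, 10));
    //   List<Predicate> dataFilters =
    //           SchemaEvolutionUtil.createDataFilters(tableFields, dataFields, tableFilters);
    //   // Each predicate is rewritten against the data field's index, name and type; predicates
    //   // on fields that do not exist in the data are silently dropped.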

    /**
     * Create converter mapping from table fields to underlying data fields. For example, the
     * table and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c INT, 6->b STRING, 3->a BIGINT
     *   <li>data fields: 1->a BIGINT, 3->c DOUBLE
     * </ul>
     *
     * <p>We can get the column types (1->a BIGINT), (3->c DOUBLE) from the data fields for (1->c
     * INT) and (3->a BIGINT) in the table fields through the index mapping [0, -1, 1], then
     * compare the data types and create the converter mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @param indexMapping the index mapping from table fields to data fields
     * @return the converter mapping, or null if no cast is needed
     */
    @Nullable
    public static CastExecutor<?, ?>[] createConvertMapping(
            List<DataField> tableFields, List<DataField> dataFields, int[] indexMapping) {
        CastExecutor<?, ?>[] converterMapping = new CastExecutor<?, ?>[tableFields.size()];
        boolean castExist = false;
        for (int i = 0; i < tableFields.size(); i++) {
            int dataIndex = indexMapping == null ? i : indexMapping[i];
            if (dataIndex < 0) {
                converterMapping[i] = CastExecutors.identityCastExecutor();
            } else {
                DataField tableField = tableFields.get(i);
                DataField dataField = dataFields.get(dataIndex);
                if (dataField.type().equalsIgnoreNullable(tableField.type())) {
                    converterMapping[i] = CastExecutors.identityCastExecutor();
                } else {
                    // TODO support column type evolution in nested type
                    checkState(
                            !tableField.type().is(DataTypeFamily.CONSTRUCTED),
                            "Only support column type evolution in atomic data type.");
                    converterMapping[i] =
                            checkNotNull(
                                    CastExecutors.resolve(dataField.type(), tableField.type()));
                    castExist = true;
                }
            }
        }

        return castExist ? converterMapping : null;
    }
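
    // A minimal usage sketch (not part of the original class): the javadoc example above with
    // index mapping [0, -1, 1].
    //
    //   CastExecutor<?, ?>[] converters =
    //           SchemaEvolutionUtil.createConvertMapping(
    //                   tableFields, dataFields, new int[] {0, -1, 1});
    //   // converters[0] casts the data's BIGINT to the table's INT, converters[1] is the
    //   // identity executor (the field is missing in data), converters[2] casts DOUBLE to
    //   // BIGINT.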

    /**
     * Create getter and casting mapping from table fields to underlying data fields with the
     * given index mapping. For example, the table and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c INT, 6->b STRING, 3->a BIGINT
     *   <li>data fields: 1->a BIGINT, 3->c DOUBLE
     * </ul>
     *
     * <p>We can get the column types (1->a BIGINT), (3->c DOUBLE) from the data fields for (1->c
     * INT) and (3->a BIGINT) in the table fields through the index mapping [0, -1, 1], then
     * compare the data types and create the getter and casting mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @param indexMapping the index mapping from table fields to data fields
     * @return the getter and casting mapping, or null if no cast is needed
     */
    private static CastFieldGetter[] createCastFieldGetterMapping(
            List<DataField> tableFields, List<DataField> dataFields, int[] indexMapping) {
        CastFieldGetter[] converterMapping = new CastFieldGetter[tableFields.size()];
        boolean castExist = false;
        for (int i = 0; i < tableFields.size(); i++) {
            int dataIndex = indexMapping == null ? i : indexMapping[i];
            if (dataIndex < 0) {
                converterMapping[i] =
                        new CastFieldGetter(row -> null, CastExecutors.identityCastExecutor());
            } else {
                DataField tableField = tableFields.get(i);
                DataField dataField = dataFields.get(dataIndex);
                if (dataField.type().equalsIgnoreNullable(tableField.type())) {
                    // Create the getter with index i; the projected row data will be converted
                    // to the underlying data
                    converterMapping[i] =
                            new CastFieldGetter(
                                    InternalRowUtils.createNullCheckingFieldGetter(
                                            dataField.type(), i),
                                    CastExecutors.identityCastExecutor());
                } else {
                    // TODO support column type evolution in nested type
                    checkState(
                            !(tableField.type() instanceof MapType
                                    || dataField.type() instanceof ArrayType
                                    || dataField.type() instanceof MultisetType
                                    || dataField.type() instanceof RowType),
                            "Only support column type evolution in atomic data type.");
                    // Create the getter with index i; the projected row data will be converted
                    // to the underlying data
                    converterMapping[i] =
                            new CastFieldGetter(
                                    InternalRowUtils.createNullCheckingFieldGetter(
                                            dataField.type(), i),
                                    checkNotNull(
                                            CastExecutors.resolve(
                                                    dataField.type(), tableField.type())));
                    castExist = true;
                }
            }
        }

        return castExist ? converterMapping : null;
    }
}