/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.schema;

import org.dinky.shaded.paimon.KeyValue;
import org.dinky.shaded.paimon.casting.CastExecutor;
import org.dinky.shaded.paimon.casting.CastExecutors;
import org.dinky.shaded.paimon.casting.CastFieldGetter;
import org.dinky.shaded.paimon.data.InternalRow;
import org.dinky.shaded.paimon.predicate.LeafPredicate;
import org.dinky.shaded.paimon.predicate.Predicate;
import org.dinky.shaded.paimon.predicate.PredicateReplaceVisitor;
import org.dinky.shaded.paimon.types.ArrayType;
import org.dinky.shaded.paimon.types.DataField;
import org.dinky.shaded.paimon.types.DataType;
import org.dinky.shaded.paimon.types.DataTypeFamily;
import org.dinky.shaded.paimon.types.MapType;
import org.dinky.shaded.paimon.types.MultisetType;
import org.dinky.shaded.paimon.types.RowType;
import org.dinky.shaded.paimon.utils.InternalRowUtils;
import org.dinky.shaded.paimon.utils.ProjectedRow;

import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

import static org.dinky.shaded.paimon.utils.Preconditions.checkNotNull;
import static org.dinky.shaded.paimon.utils.Preconditions.checkState;

/** Utils for schema evolution. */
public class SchemaEvolutionUtil {

    private static final int NULL_FIELD_INDEX = -1;

    /**
     * Create index mapping from table fields to underlying data fields. For example, the table
     * and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c, 6->b, 3->a
     *   <li>data fields: 1->a, 3->c
     * </ul>
     *
     * <p>We can get the index mapping [0, -1, 1], in which 0 is the index of table field 1->c in
     * the data fields, -1 means 6->b does not exist in the data fields, and 1 is the index of
     * 3->a in the data fields.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @return the index mapping, or null if it is an identity mapping
     */
    @Nullable
    public static int[] createIndexMapping(
            List<DataField> tableFields, List<DataField> dataFields) {
        int[] indexMapping = new int[tableFields.size()];
        Map<Integer, Integer> fieldIdToIndex = new HashMap<>();
        for (int i = 0; i < dataFields.size(); i++) {
            fieldIdToIndex.put(dataFields.get(i).id(), i);
        }

        for (int i = 0; i < tableFields.size(); i++) {
            int fieldId = tableFields.get(i).id();
            Integer dataFieldIndex = fieldIdToIndex.get(fieldId);
            if (dataFieldIndex != null) {
                indexMapping[i] = dataFieldIndex;
            } else {
                indexMapping[i] = NULL_FIELD_INDEX;
            }
        }

        // Return null for an identity mapping so that callers can skip re-mapping entirely.
        for (int i = 0; i < indexMapping.length; i++) {
            if (indexMapping[i] != i) {
                return indexMapping;
            }
        }
        return null;
    }
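
    // A minimal usage sketch (not part of the original class): the javadoc example above as
    // code. The field types (DataTypes.INT() etc.) are assumptions chosen only to build valid
    // DataField instances; only the field ids matter for the mapping.
    //
    //   List<DataField> tableFields =
    //           Arrays.asList(
    //                   new DataField(1, "c", DataTypes.INT()),
    //                   new DataField(6, "b", DataTypes.STRING()),
    //                   new DataField(3, "a", DataTypes.BIGINT()));
    //   List<DataField> dataFields =
    //           Arrays.asList(
    //                   new DataField(1, "a", DataTypes.BIGINT()),
    //                   new DataField(3, "c", DataTypes.DOUBLE()));
    //   int[] mapping = SchemaEvolutionUtil.createIndexMapping(tableFields, dataFields);
    //   // mapping == [0, -1, 1]: ids 1 and 3 are found at data indexes 0 and 1, id 6 is missing.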

    /**
     * Create index mapping from table projection to underlying data projection. For example, the
     * table and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c, 3->a, 4->e, 5->d, 6->b
     *   <li>data fields: 1->a, 2->b, 3->c, 4->d
     * </ul>
     *
     * <p>The table and data top projections are as follows:
     *
     * <ul>
     *   <li>table projection: [0, 4, 1]
     *   <li>data projection: [0, 2]
     * </ul>
     *
     * <p>We can first get the field lists of the table and data projections from their fields:
     *
     * <ul>
     *   <li>table projection field list: [1->c, 6->b, 3->a]
     *   <li>data projection field list: [1->a, 3->c]
     * </ul>
     *
     * <p>Then we create the index mapping based on the field lists, and the cast mapping based
     * on the index mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableProjection the table projection
     * @param tableFields the fields in table
     * @param dataProjection the underlying data projection
     * @param dataFields the fields in underlying data
     * @return the index and cast mapping
     */
    public static IndexCastMapping createIndexCastMapping(
            int[] tableProjection,
            List<DataField> tableFields,
            int[] dataProjection,
            List<DataField> dataFields) {
        List<DataField> tableProjectFields = projectDataFields(tableProjection, tableFields);
        List<DataField> dataProjectFields = projectDataFields(dataProjection, dataFields);

        int[] indexMapping = createIndexMapping(tableProjectFields, dataProjectFields);
        CastFieldGetter[] castMapping =
                createCastFieldGetterMapping(tableProjectFields, dataProjectFields, indexMapping);
        return new IndexCastMapping() {
            @Nullable
            @Override
            public int[] getIndexMapping() {
                return indexMapping;
            }

            @Nullable
            @Override
            public CastFieldGetter[] getCastMapping() {
                return castMapping;
            }
        };
    }

    private static List<DataField> projectDataFields(
            int[] projection, List<DataField> dataFields) {
        List<DataField> projectFields = new ArrayList<>(projection.length);
        for (int index : projection) {
            projectFields.add(dataFields.get(index));
        }
        return projectFields;
    }
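
    // A minimal usage sketch (not part of the original class): the projections from the javadoc
    // example above, with tableFields/dataFields assumed to hold the listed fields.
    //
    //   IndexCastMapping mapping =
    //           SchemaEvolutionUtil.createIndexCastMapping(
    //                   new int[] {0, 4, 1}, tableFields, new int[] {0, 2}, dataFields);
    //   int[] indexMapping = mapping.getIndexMapping(); // [0, -1, 1], or null when identity
    //   CastFieldGetter[] castMapping = mapping.getCastMapping(); // null when no cast is needed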

    /**
     * Create index mapping from table projection to data with key and value fields. We should
     * first create the table and data fields with their key/value fields, then create the index
     * mapping with their projections and fields. For example, the table and data projections and
     * fields are as follows:
     *
     * <ul>
     *   <li>Table key fields: 1->ka, 3->kb, 5->kc, 6->kd; value fields: 0->a, 2->d, 4->b;
     *       projection: [0, 2, 3, 4, 5, 7] where 0 is 1->ka, 2 is 5->kc, 3 is 6->kd, 4/5 are seq
     *       and kind, 7 is 2->d
     *   <li>Data key fields: 1->kb, 5->ka; value fields: 2->aa, 4->f; projection: [0, 1, 2, 3,
     *       4] where 0 is 1->kb, 1 is 5->ka, 2/3 are seq and kind, 4 is 2->aa
     * </ul>
     *
     * <p>First we get the max key id from the table and data fields, which is 6, then create the
     * table and data fields based on it:
     *
     * <ul>
     *   <li>Table fields: 1->ka, 3->kb, 5->kc, 6->kd, 7->seq, 8->kind, 9->a, 11->d, 13->b
     *   <li>Data fields: 1->kb, 5->ka, 7->seq, 8->kind, 11->aa, 13->f
     * </ul>
     *
     * <p>Finally we can create the index mapping with the table/data projections and fields, and
     * create the cast mapping based on the index mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableProjection the table projection
     * @param tableKeyFields the table key fields
     * @param tableValueFields the table value fields
     * @param dataProjection the data projection
     * @param dataKeyFields the data key fields
     * @param dataValueFields the data value fields
     * @return the result index and cast mapping
     */
    public static IndexCastMapping createIndexCastMapping(
            int[] tableProjection,
            List<DataField> tableKeyFields,
            List<DataField> tableValueFields,
            int[] dataProjection,
            List<DataField> dataKeyFields,
            List<DataField> dataValueFields) {
        int maxKeyId =
                Math.max(
                        tableKeyFields.stream().mapToInt(DataField::id).max().orElse(0),
                        dataKeyFields.stream().mapToInt(DataField::id).max().orElse(0));
        List<DataField> tableFields =
                KeyValue.createKeyValueFields(tableKeyFields, tableValueFields, maxKeyId);
        List<DataField> dataFields =
                KeyValue.createKeyValueFields(dataKeyFields, dataValueFields, maxKeyId);
        return createIndexCastMapping(tableProjection, tableFields, dataProjection, dataFields);
    }
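
    // A minimal usage sketch (not part of the original class): the key/value example from the
    // javadoc above. maxKeyId is max(6, 5) == 6, so seq and kind get ids 7 and 8 and the value
    // field ids are shifted past them before the plain index/cast mapping is created.
    //
    //   IndexCastMapping mapping =
    //           SchemaEvolutionUtil.createIndexCastMapping(
    //                   new int[] {0, 2, 3, 4, 5, 7}, tableKeyFields, tableValueFields,
    //                   new int[] {0, 1, 2, 3, 4}, dataKeyFields, dataValueFields);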

    /**
     * Create data projection from table projection. For example, the table and data fields are
     * as follows:
     *
     * <ul>
     *   <li>table fields: 1->c, 3->a, 4->e, 5->d, 6->b
     *   <li>data fields: 1->a, 2->b, 3->c, 4->d
     * </ul>
     *
     * <p>When we project 1->c, 6->b, 3->a from the table fields, the table projection is [[0],
     * [4], [1]], in which 0 is the index of field 1->c, 4 is the index of field 6->b and 1 is
     * the index of field 3->a in the table fields. We need to create the data projection from
     * [[0], [4], [1]] as follows:
     *
     * <ul>
     *   <li>Get the field id of each index in the table projection from the table fields
     *   <li>Get the index of each field above from the data fields
     * </ul>
     *
     * <p>Then we can create the data projection [[0], [-1], [2]], in which 0, -1 and 2 are the
     * indexes of the fields [1->c, 6->b, 3->a] in the data fields. When we project a column from
     * the underlying data, we need to specify the field index and name. It is difficult to
     * assign a proper field id and name for 6->b in the data projection and add it to the data
     * fields, and we can't use 6->b directly because field b in the underlying data is the
     * different field 2->b. So we remove the -1 field index from the data projection, and the
     * result data projection is: [[0], [2]].
     *
     * <p>We create {@link InternalRow} for 1->a, 3->c after projecting them from the underlying
     * data, then create {@link ProjectedRow} with an index mapping and return null for 6->b in
     * the table fields.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @param tableProjection the projection of table
     * @return the projection of data
     */
    public static int[][] createDataProjection(
            List<DataField> tableFields, List<DataField> dataFields, int[][] tableProjection) {
        List<Integer> dataFieldIdList =
                dataFields.stream().map(DataField::id).collect(Collectors.toList());
        return Arrays.stream(tableProjection)
                .map(p -> Arrays.copyOf(p, p.length))
                .peek(
                        p -> {
                            // Replace the top-level table field index with the data field index.
                            int fieldId = tableFields.get(p[0]).id();
                            p[0] = dataFieldIdList.indexOf(fieldId);
                        })
                .filter(p -> p[0] >= 0)
                .toArray(int[][]::new);
    }
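
    // A minimal usage sketch (not part of the original class): the javadoc example above as
    // code, with tableFields/dataFields assumed to hold the listed fields.
    //
    //   int[][] dataProjection =
    //           SchemaEvolutionUtil.createDataProjection(
    //                   tableFields, dataFields, new int[][] {{0}, {4}, {1}});
    //   // dataProjection == [[0], [2]]: the entry for 6->b is dropped because the field does
    //   // not exist in the data fields; the reader later returns null for it.
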
    /**
     * Create a predicate list based on the data fields. We visit each predicate in the filters,
     * reset its field index, name and type, and ignore the predicate if its field does not exist
     * in the data fields.
     *
     * @param tableFields the table fields
     * @param dataFields the underlying data fields
     * @param filters the filters
     * @return the data filters
     */
    @Nullable
    public static List<Predicate> createDataFilters(
            List<DataField> tableFields, List<DataField> dataFields, List<Predicate> filters) {
        if (filters == null) {
            return null;
        }

        Map<String, DataField> nameToTableFields =
                tableFields.stream().collect(Collectors.toMap(DataField::name, f -> f));
        LinkedHashMap<Integer, DataField> idToDataFields = new LinkedHashMap<>();
        dataFields.forEach(f -> idToDataFields.put(f.id(), f));
        List<Predicate> dataFilters = new ArrayList<>(filters.size());

        PredicateReplaceVisitor visitor =
                predicate -> {
                    DataField tableField =
                            checkNotNull(
                                    nameToTableFields.get(predicate.fieldName()),
                                    String.format("Find no field %s", predicate.fieldName()));
                    DataField dataField = idToDataFields.get(tableField.id());
                    if (dataField == null) {
                        return Optional.empty();
                    }

                    DataType dataValueType = dataField.type().copy(true);
                    DataType predicateType = predicate.type().copy(true);
                    CastExecutor<Object, Object> castExecutor =
                            dataValueType.equals(predicateType)
                                    ? null
                                    : (CastExecutor<Object, Object>)
                                            CastExecutors.resolve(
                                                    predicate.type(), dataField.type());
                    // Convert the value from the predicate type to the underlying data type,
                    // which may lose information, for example, convert a double value to int.
                    // But it doesn't matter because it is just for predicate push down and the
                    // data will be filtered correctly after reading.
                    List<Object> literals =
                            predicate.literals().stream()
                                    .map(v -> castExecutor == null ? v : castExecutor.cast(v))
                                    .collect(Collectors.toList());
                    return Optional.of(
                            new LeafPredicate(
                                    predicate.function(),
                                    dataField.type(),
                                    indexOf(dataField, idToDataFields),
                                    dataField.name(),
                                    literals));
                };

        for (Predicate predicate : filters) {
            predicate.visit(visitor).ifPresent(dataFilters::add);
        }
        return dataFilters;
    }

    private static int indexOf(DataField dataField, LinkedHashMap<Integer, DataField> dataFields) {
        int index = 0;
        for (Map.Entry<Integer, DataField> entry : dataFields.entrySet()) {
            if (dataField.id() == entry.getKey()) {
                return index;
            }
            index++;
        }

        throw new IllegalArgumentException(
                String.format("Can't find data field %s", dataField.name()));
    }
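
    // A minimal usage sketch (not part of the original class): pushing a table-level filter
    // down to the data fields. The use of PredicateBuilder and its equal(fieldIndex, literal)
    // call is an assumption for illustration.
    //
    //   PredicateBuilder builder = new PredicateBuilder(tableRowType);
    //   List<Predicate> tableFilters = Arrays.asList(builder.equal(0, 10));
    //   List<Predicate> dataFilters =
    //           SchemaEvolutionUtil.createDataFilters(tableFields, dataFields, tableFilters);
    //   // Each predicate is rewritten against the data field's index, name and type; predicates
    //   // on fields that do not exist in the data are silently dropped.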

    /**
     * Create converter mapping from table fields to underlying data fields. For example, the
     * table and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c INT, 6->b STRING, 3->a BIGINT
     *   <li>data fields: 1->a BIGINT, 3->c DOUBLE
     * </ul>
     *
     * <p>We can get the column types (1->a BIGINT), (3->c DOUBLE) from the data fields for (1->c
     * INT) and (3->a BIGINT) in the table fields through the index mapping [0, -1, 1], then
     * compare the data types and create the converter mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @param indexMapping the index mapping from table fields to data fields
     * @return the converter mapping, or null if no cast is needed
     */
    @Nullable
    public static CastExecutor<?, ?>[] createConvertMapping(
            List<DataField> tableFields, List<DataField> dataFields, int[] indexMapping) {
        CastExecutor<?, ?>[] converterMapping = new CastExecutor<?, ?>[tableFields.size()];
        boolean castExist = false;
        for (int i = 0; i < tableFields.size(); i++) {
            int dataIndex = indexMapping == null ? i : indexMapping[i];
            if (dataIndex < 0) {
                converterMapping[i] = CastExecutors.identityCastExecutor();
            } else {
                DataField tableField = tableFields.get(i);
                DataField dataField = dataFields.get(dataIndex);
                if (dataField.type().equalsIgnoreNullable(tableField.type())) {
                    converterMapping[i] = CastExecutors.identityCastExecutor();
                } else {
                    // TODO support column type evolution in nested type
                    checkState(
                            !tableField.type().is(DataTypeFamily.CONSTRUCTED),
                            "Only support column type evolution in atomic data type.");
                    converterMapping[i] =
                            checkNotNull(
                                    CastExecutors.resolve(dataField.type(), tableField.type()));
                    castExist = true;
                }
            }
        }

        return castExist ? converterMapping : null;
    }
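
    // A minimal usage sketch (not part of the original class): the javadoc example above with
    // index mapping [0, -1, 1].
    //
    //   CastExecutor<?, ?>[] converters =
    //           SchemaEvolutionUtil.createConvertMapping(
    //                   tableFields, dataFields, new int[] {0, -1, 1});
    //   // converters[0] casts the data's BIGINT to the table's INT, converters[1] is the
    //   // identity executor (the field is missing in data), converters[2] casts DOUBLE to
    //   // BIGINT.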

    /**
     * Create getter and casting mapping from table fields to underlying data fields with the
     * given index mapping. For example, the table and data fields are as follows:
     *
     * <ul>
     *   <li>table fields: 1->c INT, 6->b STRING, 3->a BIGINT
     *   <li>data fields: 1->a BIGINT, 3->c DOUBLE
     * </ul>
     *
     * <p>We can get the column types (1->a BIGINT), (3->c DOUBLE) from the data fields for (1->c
     * INT) and (3->a BIGINT) in the table fields through the index mapping [0, -1, 1], then
     * compare the data types and create the getter and casting mapping.
     *
     * <p>TODO: should support nested index mapping when nested schema evolution is supported.
     *
     * @param tableFields the fields of table
     * @param dataFields the fields of underlying data
     * @param indexMapping the index mapping from table fields to data fields
     * @return the getter and casting mapping, or null if no cast is needed
     */
    private static CastFieldGetter[] createCastFieldGetterMapping(
            List<DataField> tableFields, List<DataField> dataFields, int[] indexMapping) {
        CastFieldGetter[] converterMapping = new CastFieldGetter[tableFields.size()];
        boolean castExist = false;
        for (int i = 0; i < tableFields.size(); i++) {
            int dataIndex = indexMapping == null ? i : indexMapping[i];
            if (dataIndex < 0) {
                converterMapping[i] =
                        new CastFieldGetter(row -> null, CastExecutors.identityCastExecutor());
            } else {
                DataField tableField = tableFields.get(i);
                DataField dataField = dataFields.get(dataIndex);
                if (dataField.type().equalsIgnoreNullable(tableField.type())) {
                    // Create the getter with index i; the projected row data will be converted
                    // to the underlying data
                    converterMapping[i] =
                            new CastFieldGetter(
                                    InternalRowUtils.createNullCheckingFieldGetter(
                                            dataField.type(), i),
                                    CastExecutors.identityCastExecutor());
                } else {
                    // TODO support column type evolution in nested type
                    checkState(
                            !(tableField.type() instanceof MapType
                                    || dataField.type() instanceof ArrayType
                                    || dataField.type() instanceof MultisetType
                                    || dataField.type() instanceof RowType),
                            "Only support column type evolution in atomic data type.");
                    // Create the getter with index i; the projected row data will be converted
                    // to the underlying data
                    converterMapping[i] =
                            new CastFieldGetter(
                                    InternalRowUtils.createNullCheckingFieldGetter(
                                            dataField.type(), i),
                                    checkNotNull(
                                            CastExecutors.resolve(
                                                    dataField.type(), tableField.type())));
                    castExist = true;
                }
            }
        }

        return castExist ? converterMapping : null;
    }
}