// org.dinky.shaded.paimon.schema.SchemaEvolutionUtil Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dinky.shaded.paimon.schema;
import org.dinky.shaded.paimon.KeyValue;
import org.dinky.shaded.paimon.casting.CastExecutor;
import org.dinky.shaded.paimon.casting.CastExecutors;
import org.dinky.shaded.paimon.casting.CastFieldGetter;
import org.dinky.shaded.paimon.data.InternalRow;
import org.dinky.shaded.paimon.predicate.LeafPredicate;
import org.dinky.shaded.paimon.predicate.Predicate;
import org.dinky.shaded.paimon.predicate.PredicateReplaceVisitor;
import org.dinky.shaded.paimon.types.ArrayType;
import org.dinky.shaded.paimon.types.DataField;
import org.dinky.shaded.paimon.types.DataType;
import org.dinky.shaded.paimon.types.DataTypeFamily;
import org.dinky.shaded.paimon.types.MapType;
import org.dinky.shaded.paimon.types.MultisetType;
import org.dinky.shaded.paimon.types.RowType;
import org.dinky.shaded.paimon.utils.InternalRowUtils;
import org.dinky.shaded.paimon.utils.ProjectedRow;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import static org.dinky.shaded.paimon.utils.Preconditions.checkNotNull;
import static org.dinky.shaded.paimon.utils.Preconditions.checkState;
/** Utils for schema evolution. */
public class SchemaEvolutionUtil {
private static final int NULL_FIELD_INDEX = -1;
/**
 * Create index mapping from table fields to underlying data fields. For example, the table and
 * data fields are as follows
 *
 * <ul>
 *   <li>table fields: 1->c, 6->b, 3->a
 *   <li>data fields: 1->a, 3->c
 * </ul>
 *
 * <p>We can get the index mapping [0, -1, 1], in which 0 is the index of table field 1->c in
 * data fields, -1 marks that 6->b does not exist in data fields and 1 is the index of 3->a in
 * data fields.
 *
 * <p>/// TODO should support nest index mapping when nest schema evolution is supported.
 *
 * @param tableFields the fields of table
 * @param dataFields the fields of underlying data
 * @return the index mapping, or {@code null} when every table field is found at its own
 *     position in the data fields (i.e. the mapping would be the identity)
 */
@Nullable
public static int[] createIndexMapping(
        List<DataField> tableFields, List<DataField> dataFields) {
    // Position of each data field, keyed by field id.
    Map<Integer, Integer> fieldIdToIndex = new HashMap<>();
    for (int i = 0; i < dataFields.size(); i++) {
        fieldIdToIndex.put(dataFields.get(i).id(), i);
    }

    int[] indexMapping = new int[tableFields.size()];
    boolean identity = true;
    for (int i = 0; i < indexMapping.length; i++) {
        // Table fields without a counterpart in the data are marked with NULL_FIELD_INDEX.
        indexMapping[i] =
                fieldIdToIndex.getOrDefault(tableFields.get(i).id(), NULL_FIELD_INDEX);
        identity &= indexMapping[i] == i;
    }

    // An identity mapping carries no information, callers get null instead.
    return identity ? null : indexMapping;
}
/**
 * Create index mapping from table projection to underlying data projection. For example, the
 * table and data fields are as follows
 *
 * <ul>
 *   <li>table fields: 1->c, 3->a, 4->e, 5->d, 6->b
 *   <li>data fields: 1->a, 2->b, 3->c, 4->d
 * </ul>
 *
 * <p>The table and data top projections are as follows
 *
 * <ul>
 *   <li>table projection: [0, 4, 1]
 *   <li>data projection: [0, 2]
 * </ul>
 *
 * <p>We can first get fields list for table and data projections from their fields as follows
 *
 * <ul>
 *   <li>table projection field list: [1->c, 6->b, 3->a]
 *   <li>data projection field list: [1->a, 3->c]
 * </ul>
 *
 * <p>Then create index mapping based on the fields list and create cast mapping based on index
 * mapping.
 *
 * <p>/// TODO should support nest index mapping when nest schema evolution is supported.
 *
 * @param tableProjection the table projection
 * @param tableFields the fields in table
 * @param dataProjection the underlying data projection
 * @param dataFields the fields in underlying data
 * @return the index and cast mapping; its index mapping is {@code null} when
 *     {@link #createIndexMapping} returns {@code null} for the projected fields
 */
public static IndexCastMapping createIndexCastMapping(
        int[] tableProjection,
        List<DataField> tableFields,
        int[] dataProjection,
        List<DataField> dataFields) {
    // Resolve both projections to concrete field lists before comparing them.
    List<DataField> tableProjectFields = projectDataFields(tableProjection, tableFields);
    List<DataField> dataProjectFields = projectDataFields(dataProjection, dataFields);

    int[] indexMapping = createIndexMapping(tableProjectFields, dataProjectFields);
    CastFieldGetter[] castMapping =
            createCastFieldGetterMapping(tableProjectFields, dataProjectFields, indexMapping);

    return new IndexCastMapping() {
        @Nullable
        @Override
        public int[] getIndexMapping() {
            return indexMapping;
        }

        @Nullable
        @Override
        public CastFieldGetter[] getCastMapping() {
            return castMapping;
        }
    };
}
/**
 * Pick the fields addressed by a top-level projection, keeping the projection order.
 *
 * @param projection indexes into {@code dataFields}
 * @param dataFields the fields to pick from
 * @return a new list holding {@code dataFields.get(index)} for every index in the projection
 */
private static List<DataField> projectDataFields(int[] projection, List<DataField> dataFields) {
    List<DataField> projectFields = new ArrayList<>(projection.length);
    for (int index : projection) {
        projectFields.add(dataFields.get(index));
    }
    return projectFields;
}
/**
 * Create index mapping from table projection to data with key and value fields. We should first
 * create table and data fields with their key/value fields, then create index mapping with
 * their projections and fields. For example, the table and data projections and fields are as
 * follows
 *
 * <ul>
 *   <li>Table key fields: 1->ka, 3->kb, 5->kc, 6->kd; value fields: 0->a, 2->d, 4->b;
 *       projection: [0, 2, 3, 4, 5, 7] where 0 is 1->ka, 2 is 5->kc, 3 is 6->kd, 4/5 are seq
 *       and kind, 7 is 2->d
 *   <li>Data key fields: 1->kb, 5->ka; value fields: 2->aa, 4->f; projection: [0, 1, 2, 3, 4]
 *       where 0 is 1->kb, 1 is 5->ka, 2/3 are seq and kind, 4 is 2->aa
 * </ul>
 *
 * <p>First we will get max key id from table and data fields which is 6, then create table and
 * data fields on it
 *
 * <ul>
 *   <li>Table fields: 1->ka, 3->kb, 5->kc, 6->kd, 7->seq, 8->kind, 9->a, 11->d, 13->b
 *   <li>Data fields: 1->kb, 5->ka, 7->seq, 8->kind, 11->aa, 13->f
 * </ul>
 *
 * <p>Finally we can create index mapping with table/data projections and fields, and create
 * cast mapping based on index mapping.
 *
 * <p>/// TODO should support nest index mapping when nest schema evolution is supported.
 *
 * @param tableProjection the table projection
 * @param tableKeyFields the table key fields
 * @param tableValueFields the table value fields
 * @param dataProjection the data projection
 * @param dataKeyFields the data key fields
 * @param dataValueFields the data value fields
 * @return the result index and cast mapping
 */
public static IndexCastMapping createIndexCastMapping(
        int[] tableProjection,
        List<DataField> tableKeyFields,
        List<DataField> tableValueFields,
        int[] dataProjection,
        List<DataField> dataKeyFields,
        List<DataField> dataValueFields) {
    // Both sides must be built on the same max key id so that the generated field ids line up;
    // an empty key list contributes 0.
    int maxTableKeyId = tableKeyFields.stream().mapToInt(DataField::id).max().orElse(0);
    int maxDataKeyId = dataKeyFields.stream().mapToInt(DataField::id).max().orElse(0);
    int maxKeyId = Math.max(maxTableKeyId, maxDataKeyId);

    List<DataField> tableFields =
            KeyValue.createKeyValueFields(tableKeyFields, tableValueFields, maxKeyId);
    List<DataField> dataFields =
            KeyValue.createKeyValueFields(dataKeyFields, dataValueFields, maxKeyId);

    return createIndexCastMapping(tableProjection, tableFields, dataProjection, dataFields);
}
/**
 * Create data projection from table projection. For example, the table and data fields are as
 * follows
 *
 * <ul>
 *   <li>table fields: 1->c, 3->a, 4->e, 5->d, 6->b
 *   <li>data fields: 1->a, 2->b, 3->c, 4->d
 * </ul>
 *
 * <p>When we project 1->c, 6->b, 3->a from table fields, the table projection is [[0], [4],
 * [1]], in which 0 is the index of field 1->c, 4 is the index of field 6->b, 1 is the index of
 * field 3->a in table fields. We need to create data projection from [[0], [4], [1]] as
 * follows:
 *
 * <ul>
 *   <li>Get field id of each index in table projection from table fields
 *   <li>Get index of each field above from data fields
 * </ul>
 *
 * <p>Then we can create the data projection as follows: [[0], [-1], [2]], in which 0, -1 and 2
 * are the index of fields [1->c, 6->b, 3->a] in data fields. When we project column from
 * underlying data, we need to specify the field index and name. It is difficult to assign a
 * proper field id and name for 6->b in data projection and add it to data fields, and we can't
 * use 6->b directly because the field named b in underlying data has id 2 and is a different
 * field. We can remove the -1 field index in data projection, then the result data projection
 * is: [[0], [2]].
 *
 * <p>We create {@link InternalRow} for 1->a, 3->c after projecting them from underlying data,
 * then create {@link ProjectedRow} with an index mapping and return null for 6->b in table
 * fields.
 *
 * @param tableFields the fields of table
 * @param dataFields the fields of underlying data
 * @param tableProjection the projection of table
 * @return the projection of data; entries of {@code tableProjection} are copied, never modified
 */
public static int[][] createDataProjection(
        List<DataField> tableFields, List<DataField> dataFields, int[][] tableProjection) {
    // Build the id -> position lookup once instead of scanning the data fields per entry.
    // putIfAbsent keeps the first position of an id, matching List.indexOf semantics.
    Map<Integer, Integer> dataFieldIdToIndex = new HashMap<>();
    for (int i = 0; i < dataFields.size(); i++) {
        dataFieldIdToIndex.putIfAbsent(dataFields.get(i).id(), i);
    }

    List<int[]> dataProjection = new ArrayList<>(tableProjection.length);
    for (int[] tablePath : tableProjection) {
        // Only the top-level index (element 0) is remapped; nested indexes are kept as is.
        Integer dataIndex = dataFieldIdToIndex.get(tableFields.get(tablePath[0]).id());
        if (dataIndex == null) {
            // The field does not exist in the underlying data, drop it from the projection.
            continue;
        }
        int[] dataPath = Arrays.copyOf(tablePath, tablePath.length);
        dataPath[0] = dataIndex;
        dataProjection.add(dataPath);
    }
    return dataProjection.toArray(new int[0][]);
}
/**
* Create predicate list from data fields. We will visit all predicate in filters, reset it's
* field index, name and type, and ignore predicate if the field is not exist.
*
* @param tableFields the table fields
* @param dataFields the underlying data fields
* @param filters the filters
* @return the data filters
*/
@Nullable
public static List createDataFilters(
List tableFields, List dataFields, List filters) {
if (filters == null) {
return null;
}
Map nameToTableFields =
tableFields.stream().collect(Collectors.toMap(DataField::name, f -> f));
LinkedHashMap idToDataFields = new LinkedHashMap<>();
dataFields.forEach(f -> idToDataFields.put(f.id(), f));
List dataFilters = new ArrayList<>(filters.size());
PredicateReplaceVisitor visitor =
predicate -> {
DataField tableField =
checkNotNull(
nameToTableFields.get(predicate.fieldName()),
String.format("Find no field %s", predicate.fieldName()));
DataField dataField = idToDataFields.get(tableField.id());
if (dataField == null) {
return Optional.empty();
}
DataType dataValueType = dataField.type().copy(true);
DataType predicateType = predicate.type().copy(true);
CastExecutor