/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dinky.shaded.paimon.schema;
import org.dinky.shaded.paimon.CoreOptions;
import org.dinky.shaded.paimon.CoreOptions.ChangelogProducer;
import org.dinky.shaded.paimon.casting.CastExecutor;
import org.dinky.shaded.paimon.casting.CastExecutors;
import org.dinky.shaded.paimon.data.BinaryString;
import org.dinky.shaded.paimon.format.FileFormat;
import org.dinky.shaded.paimon.options.ConfigOption;
import org.dinky.shaded.paimon.options.Options;
import org.dinky.shaded.paimon.types.ArrayType;
import org.dinky.shaded.paimon.types.DataField;
import org.dinky.shaded.paimon.types.DataType;
import org.dinky.shaded.paimon.types.MapType;
import org.dinky.shaded.paimon.types.MultisetType;
import org.dinky.shaded.paimon.types.RowType;
import org.dinky.shaded.paimon.types.VarCharType;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import static org.dinky.shaded.paimon.CoreOptions.BUCKET_KEY;
import static org.dinky.shaded.paimon.CoreOptions.CHANGELOG_PRODUCER;
import static org.dinky.shaded.paimon.CoreOptions.FIELDS_PREFIX;
import static org.dinky.shaded.paimon.CoreOptions.FULL_COMPACTION_DELTA_COMMITS;
import static org.dinky.shaded.paimon.CoreOptions.INCREMENTAL_BETWEEN;
import static org.dinky.shaded.paimon.CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP;
import static org.dinky.shaded.paimon.CoreOptions.SCAN_MODE;
import static org.dinky.shaded.paimon.CoreOptions.SCAN_SNAPSHOT_ID;
import static org.dinky.shaded.paimon.CoreOptions.SCAN_TAG_NAME;
import static org.dinky.shaded.paimon.CoreOptions.SCAN_TIMESTAMP_MILLIS;
import static org.dinky.shaded.paimon.CoreOptions.SNAPSHOT_NUM_RETAINED_MAX;
import static org.dinky.shaded.paimon.CoreOptions.SNAPSHOT_NUM_RETAINED_MIN;
import static org.dinky.shaded.paimon.CoreOptions.STREAMING_READ_OVERWRITE;
import static org.dinky.shaded.paimon.mergetree.compact.PartialUpdateMergeFunction.SEQUENCE_GROUP;
import static org.dinky.shaded.paimon.schema.SystemColumns.KEY_FIELD_PREFIX;
import static org.dinky.shaded.paimon.schema.SystemColumns.SYSTEM_FIELD_NAMES;
import static org.dinky.shaded.paimon.utils.Preconditions.checkArgument;
import static org.dinky.shaded.paimon.utils.Preconditions.checkState;
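
// A minimal usage sketch (hypothetical variable names; SchemaManager is Paimon's
// schema store, and validateTableSchema throws on an invalid schema or options):
//
//   SchemaManager schemaManager = new SchemaManager(fileIO, tablePath);
//   TableSchema schema = schemaManager.latest().get();
//   SchemaValidation.validateTableSchema(schema);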
/** Validation utils for {@link TableSchema}. */
public class SchemaValidation {
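
    /** Complex types that must not appear in a primary key (checked per key field below). */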
    public static final List<Class<? extends DataType>> PRIMARY_KEY_UNSUPPORTED_LOGICAL_TYPES =
Arrays.asList(MapType.class, ArrayType.class, RowType.class, MultisetType.class);
/**
* Validate the {@link TableSchema} and {@link CoreOptions}.
*
* TODO validate all items in schema and all keys in options.
*
* @param schema the schema to be validated
*/
public static void validateTableSchema(TableSchema schema) {
validatePrimaryKeysType(schema.fields(), schema.primaryKeys());
CoreOptions options = new CoreOptions(schema.options());
validateDefaultValues(schema);
validateStartupMode(options);
validateSequenceGroup(schema, options);
ChangelogProducer changelogProducer = options.changelogProducer();
if (schema.primaryKeys().isEmpty() && changelogProducer != ChangelogProducer.NONE) {
throw new UnsupportedOperationException(
String.format(
"Can not set %s on table without primary keys, please define primary keys.",
CHANGELOG_PRODUCER.key()));
}
if (options.streamingReadOverwrite()
&& (changelogProducer == ChangelogProducer.FULL_COMPACTION
|| changelogProducer == ChangelogProducer.LOOKUP)) {
throw new UnsupportedOperationException(
String.format(
"Cannot set %s to true when changelog producer is %s or %s because it will read duplicated changes.",
STREAMING_READ_OVERWRITE.key(),
ChangelogProducer.FULL_COMPACTION,
ChangelogProducer.LOOKUP));
}
checkArgument(
options.snapshotNumRetainMin() > 0,
SNAPSHOT_NUM_RETAINED_MIN.key() + " should be at least 1");
checkArgument(
options.snapshotNumRetainMin() <= options.snapshotNumRetainMax(),
SNAPSHOT_NUM_RETAINED_MIN.key()
+ " should not be larger than "
+ SNAPSHOT_NUM_RETAINED_MAX.key());
        // Resolve the format type here, which converts the string value to a
        // FileFormatType; if the string value is illegal, an exception is thrown.
CoreOptions.FileFormatType fileFormatType = options.formatType();
FileFormat fileFormat =
FileFormat.fromIdentifier(fileFormatType.name(), new Options(schema.options()));
fileFormat.validateDataFields(new RowType(schema.fields()));
// Check column names in schema
schema.fieldNames()
.forEach(
f -> {
checkState(
!SYSTEM_FIELD_NAMES.contains(f),
String.format(
"Field name[%s] in schema cannot be exist in %s",
f, SYSTEM_FIELD_NAMES));
checkState(
!f.startsWith(KEY_FIELD_PREFIX),
String.format(
"Field name[%s] in schema cannot start with [%s]",
f, KEY_FIELD_PREFIX));
});
if (options.bucket() == -1 && options.toMap().get(BUCKET_KEY.key()) != null) {
throw new RuntimeException(
"Cannot define 'bucket-key' in unaware or dynamic bucket mode.");
}
if (options.bucket() == -1
&& schema.primaryKeys().isEmpty()
&& options.toMap().get(FULL_COMPACTION_DELTA_COMMITS.key()) != null) {
throw new RuntimeException(
"AppendOnlyTable of unware or dynamic bucket does not support 'full-compaction.delta-commits'");
}
if (schema.primaryKeys().isEmpty() && options.streamingReadOverwrite()) {
throw new RuntimeException(
"Doesn't support streaming read the changes from overwrite when the primary keys are not defined.");
}
if (schema.options().containsKey(CoreOptions.PARTITION_EXPIRATION_TIME.key())) {
if (schema.partitionKeys().isEmpty()) {
throw new IllegalArgumentException(
"Can not set 'partition.expiration-time' for non-partitioned table.");
}
}
        Optional<String> sequenceField = options.sequenceField();
sequenceField.ifPresent(
field ->
checkArgument(
schema.fieldNames().contains(field),
"Nonexistent sequence field: '%s'",
field));
sequenceField.ifPresent(
field ->
checkArgument(
options.fieldAggFunc(field) == null,
"Should not define aggregation on sequence field: '%s'",
field));
CoreOptions.MergeEngine mergeEngine = options.mergeEngine();
if (mergeEngine == CoreOptions.MergeEngine.FIRST_ROW) {
if (sequenceField.isPresent()) {
throw new IllegalArgumentException(
"Do not support use sequence field on FIRST_MERGE merge engine");
}
if (changelogProducer != ChangelogProducer.LOOKUP) {
throw new IllegalArgumentException(
"Only support 'lookup' changelog-producer on FIRST_MERGE merge engine");
}
}
if (schema.crossPartitionUpdate()) {
if (options.bucket() != -1) {
throw new IllegalArgumentException(
String.format(
"You should use dynamic bucket (bucket = -1) mode in cross partition update case "
+ "(Primary key constraint %s not include all partition fields %s).",
schema.primaryKeys(), schema.partitionKeys()));
}
if (sequenceField.isPresent()) {
throw new IllegalArgumentException(
String.format(
"You can not use sequence.field in cross partition update case "
+ "(Primary key constraint %s not include all partition fields %s).",
schema.primaryKeys(), schema.partitionKeys()));
}
}
}
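
    /**
     * Rejects primary key fields whose type is in {@link #PRIMARY_KEY_UNSUPPORTED_LOGICAL_TYPES}.
     *
     * <p>A sketch of a rejected schema (hypothetical field names, built with Paimon's
     * {@code DataTypes} factory):
     *
     * <pre>{@code
     * List<DataField> fields =
     *         Arrays.asList(
     *                 new DataField(0, "id", DataTypes.INT()),
     *                 new DataField(1, "tags", DataTypes.ARRAY(DataTypes.STRING())));
     * // Throws UnsupportedOperationException: ArrayType is unsupported in key "tags".
     * validatePrimaryKeysType(fields, Collections.singletonList("tags"));
     * }</pre>
     */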
    private static void validatePrimaryKeysType(List<DataField> fields, List<String> primaryKeys) {
if (!primaryKeys.isEmpty()) {
            Map<String, DataField> rowFields = new HashMap<>();
for (DataField rowField : fields) {
rowFields.put(rowField.name(), rowField);
}
for (String primaryKeyName : primaryKeys) {
DataField rowField = rowFields.get(primaryKeyName);
DataType dataType = rowField.type();
if (PRIMARY_KEY_UNSUPPORTED_LOGICAL_TYPES.stream()
.anyMatch(c -> c.isInstance(dataType))) {
throw new UnsupportedOperationException(
String.format(
"The type %s in primary key field %s is unsupported",
dataType.getClass().getSimpleName(), primaryKeyName));
}
}
}
}
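
    /**
     * Checks that the configured {@code scan.mode} is accompanied by exactly the options it
     * needs and none of the conflicting ones.
     *
     * <p>For illustration (hypothetical values), this combination passes:
     *
     * <pre>{@code
     * scan.mode = from-snapshot
     * scan.snapshot-id = 3
     * }</pre>
     *
     * <p>while additionally setting {@code scan.timestamp-millis} would fail the conflict check.
     */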
private static void validateStartupMode(CoreOptions options) {
if (options.startupMode() == CoreOptions.StartupMode.FROM_TIMESTAMP) {
checkOptionExistInMode(
options, SCAN_TIMESTAMP_MILLIS, CoreOptions.StartupMode.FROM_TIMESTAMP);
checkOptionsConflict(
options,
Arrays.asList(
SCAN_SNAPSHOT_ID,
SCAN_TAG_NAME,
INCREMENTAL_BETWEEN_TIMESTAMP,
INCREMENTAL_BETWEEN),
Collections.singletonList(SCAN_TIMESTAMP_MILLIS));
} else if (options.startupMode() == CoreOptions.StartupMode.FROM_SNAPSHOT) {
checkExactOneOptionExistInMode(
options, options.startupMode(), SCAN_SNAPSHOT_ID, SCAN_TAG_NAME);
checkOptionsConflict(
options,
Arrays.asList(
SCAN_TIMESTAMP_MILLIS,
INCREMENTAL_BETWEEN_TIMESTAMP,
INCREMENTAL_BETWEEN),
Arrays.asList(SCAN_SNAPSHOT_ID, SCAN_TAG_NAME));
} else if (options.startupMode() == CoreOptions.StartupMode.INCREMENTAL) {
checkExactOneOptionExistInMode(
options,
options.startupMode(),
INCREMENTAL_BETWEEN,
INCREMENTAL_BETWEEN_TIMESTAMP);
checkOptionsConflict(
options,
Arrays.asList(SCAN_SNAPSHOT_ID, SCAN_TIMESTAMP_MILLIS, SCAN_TAG_NAME),
Arrays.asList(INCREMENTAL_BETWEEN, INCREMENTAL_BETWEEN_TIMESTAMP));
} else if (options.startupMode() == CoreOptions.StartupMode.FROM_SNAPSHOT_FULL) {
checkOptionExistInMode(options, SCAN_SNAPSHOT_ID, options.startupMode());
checkOptionsConflict(
options,
Arrays.asList(
SCAN_TIMESTAMP_MILLIS,
SCAN_TAG_NAME,
INCREMENTAL_BETWEEN_TIMESTAMP,
INCREMENTAL_BETWEEN),
Collections.singletonList(SCAN_SNAPSHOT_ID));
} else {
checkOptionNotExistInMode(options, SCAN_TIMESTAMP_MILLIS, options.startupMode());
checkOptionNotExistInMode(options, SCAN_SNAPSHOT_ID, options.startupMode());
checkOptionNotExistInMode(options, SCAN_TAG_NAME, options.startupMode());
checkOptionNotExistInMode(
options, INCREMENTAL_BETWEEN_TIMESTAMP, options.startupMode());
checkOptionNotExistInMode(options, INCREMENTAL_BETWEEN, options.startupMode());
}
}
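
    /** Requires {@code option} to be set when {@code startupMode} is used. */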
private static void checkOptionExistInMode(
            CoreOptions options, ConfigOption<?> option, CoreOptions.StartupMode startupMode) {
checkArgument(
options.toConfiguration().contains(option),
String.format(
"%s can not be null when you use %s for %s",
option.key(), startupMode, SCAN_MODE.key()));
}
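
    /** Requires {@code option} to be absent when {@code startupMode} is used. */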
private static void checkOptionNotExistInMode(
            CoreOptions options, ConfigOption<?> option, CoreOptions.StartupMode startupMode) {
checkArgument(
!options.toConfiguration().contains(option),
String.format(
"%s must be null when you use %s for %s",
option.key(), startupMode, SCAN_MODE.key()));
}
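
    /** Requires exactly one of {@code configOptions} to be set for {@code startupMode}. */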
private static void checkExactOneOptionExistInMode(
CoreOptions options,
CoreOptions.StartupMode startupMode,
            ConfigOption<?>... configOptions) {
checkArgument(
Arrays.stream(configOptions)
.filter(op -> options.toConfiguration().contains(op))
.count()
== 1,
String.format(
"must set only one key in [%s] when you use %s for %s",
concatConfigKeys(Arrays.asList(configOptions)),
startupMode,
SCAN_MODE.key()));
}
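
    /** Rejects any of {@code illegalOptions} being set alongside {@code legalOptions}. */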
private static void checkOptionsConflict(
CoreOptions options,
            List<ConfigOption<?>> illegalOptions,
            List<ConfigOption<?>> legalOptions) {
        for (ConfigOption<?> illegalOption : illegalOptions) {
checkArgument(
!options.toConfiguration().contains(illegalOption),
"[%s] must be null when you set [%s]",
illegalOption.key(),
concatConfigKeys(legalOptions));
}
}
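
    /** Joins the keys of the given options with commas, for error messages. */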
    private static String concatConfigKeys(List<ConfigOption<?>> configOptions) {
return configOptions.stream().map(ConfigOption::key).collect(Collectors.joining(","));
}
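
    /**
     * Validates {@code fields.<seq-field>.sequence-group} options: the sequence field and every
     * grouped field must exist in the schema, a field may belong to only one group, and a
     * sequence field must not also define an aggregate function.
     *
     * <p>A passing example (hypothetical column names, where {@code seq}, {@code a} and
     * {@code b} are columns of the table):
     *
     * <pre>{@code
     * fields.seq.sequence-group = a,b
     * }</pre>
     */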
private static void validateSequenceGroup(TableSchema schema, CoreOptions options) {
        Map<String, Set<String>> fields2Group = new HashMap<>();
        for (Map.Entry<String, String> entry : options.toMap().entrySet()) {
String k = entry.getKey();
String v = entry.getValue();
            List<String> fieldNames = schema.fieldNames();
if (k.startsWith(FIELDS_PREFIX) && k.endsWith(SEQUENCE_GROUP)) {
String sequenceFieldName =
k.substring(
FIELDS_PREFIX.length() + 1,
k.length() - SEQUENCE_GROUP.length() - 1);
if (!fieldNames.contains(sequenceFieldName)) {
throw new IllegalArgumentException(
String.format(
"The sequence field group: %s can not be found in table schema.",
sequenceFieldName));
}
for (String field : v.split(",")) {
if (!fieldNames.contains(field)) {
throw new IllegalArgumentException(
String.format("Field %s can not be found in table schema.", field));
}
                    Set<String> group = fields2Group.computeIfAbsent(field, p -> new HashSet<>());
if (group.add(sequenceFieldName) && group.size() > 1) {
throw new IllegalArgumentException(
String.format(
"Field %s is defined repeatedly by multiple groups: %s.",
field, group));
}
}
}
}
        Set<String> illegalGroup =
fields2Group.values().stream()
.flatMap(Collection::stream)
.filter(g -> options.fieldAggFunc(g) != null)
.collect(Collectors.toSet());
if (!illegalGroup.isEmpty()) {
throw new IllegalArgumentException(
"Should not defined aggregation function on sequence group: " + illegalGroup);
}
}
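
    /**
     * Validates {@code fields.<column>.default-value} options: partition keys and primary keys
     * may not carry defaults, and every default value string must be castable to its column
     * type.
     *
     * <p>For illustration (hypothetical column name):
     *
     * <pre>{@code
     * fields.price.default-value = 0.0   // ok for a DOUBLE column, rejected for a MAP column
     * }</pre>
     */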
private static void validateDefaultValues(TableSchema schema) {
CoreOptions coreOptions = new CoreOptions(schema.options());
        Map<String, String> defaultValues = coreOptions.getFieldDefaultValues();
if (!defaultValues.isEmpty()) {
            List<String> partitionKeys = schema.partitionKeys();
for (String partitionKey : partitionKeys) {
if (defaultValues.containsKey(partitionKey)) {
throw new IllegalArgumentException(
String.format(
"Partition key %s should not be assign default column.",
partitionKey));
}
}
            List<String> primaryKeys = schema.primaryKeys();
for (String primaryKey : primaryKeys) {
if (defaultValues.containsKey(primaryKey)) {
throw new IllegalArgumentException(
String.format(
"Primary key %s should not be assign default column.",
primaryKey));
}
}
            List<DataField> fields = schema.fields();
for (DataField field : fields) {
String defaultValueStr = defaultValues.get(field.name());
if (defaultValueStr == null) {
continue;
}
@SuppressWarnings("unchecked")
CastExecutor