/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg.connect.data;

import java.math.BigDecimal;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.OffsetDateTime;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.UpdateSchema;
import org.apache.iceberg.connect.IcebergSinkConfig;
import org.apache.iceberg.connect.data.SchemaUpdate.AddColumn;
import org.apache.iceberg.connect.data.SchemaUpdate.MakeOptional;
import org.apache.iceberg.connect.data.SchemaUpdate.UpdateType;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Type.PrimitiveType;
import org.apache.iceberg.types.Type.TypeID;
import org.apache.iceberg.types.Types.BinaryType;
import org.apache.iceberg.types.Types.BooleanType;
import org.apache.iceberg.types.Types.DateType;
import org.apache.iceberg.types.Types.DecimalType;
import org.apache.iceberg.types.Types.DoubleType;
import org.apache.iceberg.types.Types.FloatType;
import org.apache.iceberg.types.Types.IntegerType;
import org.apache.iceberg.types.Types.ListType;
import org.apache.iceberg.types.Types.LongType;
import org.apache.iceberg.types.Types.MapType;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.iceberg.types.Types.StringType;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.types.Types.TimeType;
import org.apache.iceberg.types.Types.TimestampType;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.Tasks;
import org.apache.kafka.connect.data.Date;
import org.apache.kafka.connect.data.Decimal;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.Time;
import org.apache.kafka.connect.data.Timestamp;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
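
/**
 * Utilities used by the Iceberg sink connector to evolve table schemas and to map Kafka
 * Connect schemas and raw record values to Iceberg types.
 */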
class SchemaUtils {
private static final Logger LOG = LoggerFactory.getLogger(SchemaUtils.class);
private static final Pattern TRANSFORM_REGEX = Pattern.compile("(\\w+)\\((.+)\\)");
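
/**
 * Returns the widened Iceberg primitive type needed to hold the incoming Connect value, or
 * null if no change is required. Only two promotions are supported: float to double and
 * int to long.
 */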
static PrimitiveType needsDataTypeUpdate(Type currentIcebergType, Schema valueSchema) {
if (currentIcebergType.typeId() == TypeID.FLOAT && valueSchema.type() == Schema.Type.FLOAT64) {
return DoubleType.get();
}
if (currentIcebergType.typeId() == TypeID.INTEGER && valueSchema.type() == Schema.Type.INT64) {
return LongType.get();
}
return null;
}
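
/**
 * Applies the buffered schema updates to the table, retrying the commit up to
 * {@code IcebergSinkConfig.SCHEMA_UPDATE_RETRIES} times on failure. A no-op when there are
 * no updates.
 */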
static void applySchemaUpdates(Table table, SchemaUpdate.Consumer updates) {
if (updates == null || updates.empty()) {
// no updates to apply
return;
}
Tasks.range(1)
.retry(IcebergSinkConfig.SCHEMA_UPDATE_RETRIES)
.run(notUsed -> commitSchemaUpdates(table, updates));
}
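
// Refreshes the table metadata, drops any updates that a concurrent writer has already
// applied, and commits the remainder as a single schema change.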
private static void commitSchemaUpdates(Table table, SchemaUpdate.Consumer updates) {
// get the latest schema in case another process updated it
table.refresh();
// filter out columns that have already been added
List<AddColumn> addColumns =
updates.addColumns().stream()
.filter(addCol -> !columnExists(table.schema(), addCol))
.collect(Collectors.toList());
// filter out columns that have the updated type
List<UpdateType> updateTypes =
updates.updateTypes().stream()
.filter(updateType -> !typeMatches(table.schema(), updateType))
.collect(Collectors.toList());
// filter out columns that have already been made optional
List<MakeOptional> makeOptionals =
updates.makeOptionals().stream()
.filter(makeOptional -> !isOptional(table.schema(), makeOptional))
.collect(Collectors.toList());
if (addColumns.isEmpty() && updateTypes.isEmpty() && makeOptionals.isEmpty()) {
// no updates to apply
LOG.info("Schema for table {} already up-to-date", table.name());
return;
}
// apply the updates
UpdateSchema updateSchema = table.updateSchema();
addColumns.forEach(
update -> updateSchema.addColumn(update.parentName(), update.name(), update.type()));
updateTypes.forEach(update -> updateSchema.updateColumn(update.name(), update.type()));
makeOptionals.forEach(update -> updateSchema.makeColumnOptional(update.name()));
updateSchema.commit();
LOG.info("Schema for table {} updated with new columns", table.name());
}
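
// The three predicates below detect updates that are already present in the table schema,
// so that re-applying an update made by a concurrent writer does not fail the commit.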
private static boolean columnExists(org.apache.iceberg.Schema schema, AddColumn update) {
return schema.findType(update.key()) != null;
}
private static boolean typeMatches(org.apache.iceberg.Schema schema, UpdateType update) {
Type type = schema.findType(update.name());
if (type == null) {
throw new IllegalArgumentException("Invalid column: " + update.name());
}
return type.typeId() == update.type().typeId();
}
private static boolean isOptional(org.apache.iceberg.Schema schema, MakeOptional update) {
NestedField field = schema.findField(update.name());
if (field == null) {
throw new IllegalArgumentException("Invalid column: " + update.name());
}
return field.isOptional();
}
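
/**
 * Builds a {@link PartitionSpec} from the configured partition-by list. Each entry is
 * either a plain column name (identity partitioning) or a transform expression such as
 * {@code year(ts)}, {@code month(ts)}, {@code day(ts)}, {@code hour(ts)},
 * {@code bucket(id, 16)}, or {@code truncate(name, 4)} (column names here are
 * illustrative). An empty list yields an unpartitioned spec.
 */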
static PartitionSpec createPartitionSpec(
org.apache.iceberg.Schema schema, List<String> partitionBy) {
if (partitionBy.isEmpty()) {
return PartitionSpec.unpartitioned();
}
PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(schema);
partitionBy.forEach(
partitionField -> {
Matcher matcher = TRANSFORM_REGEX.matcher(partitionField);
if (matcher.matches()) {
String transform = matcher.group(1);
switch (transform) {
case "year":
case "years":
specBuilder.year(matcher.group(2));
break;
case "month":
case "months":
specBuilder.month(matcher.group(2));
break;
case "day":
case "days":
specBuilder.day(matcher.group(2));
break;
case "hour":
case "hours":
specBuilder.hour(matcher.group(2));
break;
case "bucket":
{
Pair<String, Integer> args = transformArgPair(matcher.group(2));
specBuilder.bucket(args.first(), args.second());
break;
}
case "truncate":
{
Pair<String, Integer> args = transformArgPair(matcher.group(2));
specBuilder.truncate(args.first(), args.second());
break;
}
default:
throw new UnsupportedOperationException("Unsupported transform: " + transform);
}
} else {
specBuilder.identity(partitionField);
}
});
return specBuilder.build();
}
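
// Splits a two-argument transform argument list such as "id, 16" into the source column
// name and the integer bucket count or truncation width.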
private static Pair<String, Integer> transformArgPair(String argsStr) {
List<String> parts = Splitter.on(',').splitToList(argsStr);
if (parts.size() != 2) {
throw new IllegalArgumentException("Invalid argument " + argsStr + ", should have 2 parts");
}
return Pair.of(parts.get(0).trim(), Integer.parseInt(parts.get(1).trim()));
}
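
/** Converts a Kafka Connect schema to the equivalent Iceberg type. */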
static Type toIcebergType(Schema valueSchema, IcebergSinkConfig config) {
return new SchemaGenerator(config).toIcebergType(valueSchema);
}
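
/** Infers an Iceberg type from a raw value, for records that carry no Connect schema. */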
static Type inferIcebergType(Object value, IcebergSinkConfig config) {
return new SchemaGenerator(config).inferIcebergType(value);
}
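
/** Stateful converter that assigns sequential field IDs while building nested types. */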
static class SchemaGenerator {
private int fieldId = 1;
private final IcebergSinkConfig config;
SchemaGenerator(IcebergSinkConfig config) {
this.config = config;
}
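
// Maps Connect types to Iceberg types, honoring Connect's logical types (Decimal, Date,
// Time, Timestamp). Connect's Decimal only records the scale, so precision defaults to
// the maximum of 38. Unrecognized types fall through to string.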
@SuppressWarnings("checkstyle:CyclomaticComplexity")
Type toIcebergType(Schema valueSchema) {
switch (valueSchema.type()) {
case BOOLEAN:
return BooleanType.get();
case BYTES:
if (Decimal.LOGICAL_NAME.equals(valueSchema.name())) {
int scale = Integer.parseInt(valueSchema.parameters().get(Decimal.SCALE_FIELD));
return DecimalType.of(38, scale);
}
return BinaryType.get();
case INT8:
case INT16:
return IntegerType.get();
case INT32:
if (Date.LOGICAL_NAME.equals(valueSchema.name())) {
return DateType.get();
} else if (Time.LOGICAL_NAME.equals(valueSchema.name())) {
return TimeType.get();
}
return IntegerType.get();
case INT64:
if (Timestamp.LOGICAL_NAME.equals(valueSchema.name())) {
return TimestampType.withZone();
}
return LongType.get();
case FLOAT32:
return FloatType.get();
case FLOAT64:
return DoubleType.get();
case ARRAY:
Type elementType = toIcebergType(valueSchema.valueSchema());
if (config.schemaForceOptional() || valueSchema.valueSchema().isOptional()) {
return ListType.ofOptional(nextId(), elementType);
} else {
return ListType.ofRequired(nextId(), elementType);
}
case MAP:
Type keyType = toIcebergType(valueSchema.keySchema());
Type valueType = toIcebergType(valueSchema.valueSchema());
if (config.schemaForceOptional() || valueSchema.valueSchema().isOptional()) {
return MapType.ofOptional(nextId(), nextId(), keyType, valueType);
} else {
return MapType.ofRequired(nextId(), nextId(), keyType, valueType);
}
case STRUCT:
List<NestedField> structFields =
valueSchema.fields().stream()
.map(
field ->
NestedField.of(
nextId(),
config.schemaForceOptional() || field.schema().isOptional(),
field.name(),
toIcebergType(field.schema())))
.collect(Collectors.toList());
return StructType.of(structFields);
case STRING:
default:
return StringType.get();
}
}
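
// Best-effort inference for schemaless values: integral numbers widen to long and
// floating-point numbers to double; maps become structs keyed by entry name; returns
// null when the type cannot be determined (nulls, empty collections).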
@SuppressWarnings("checkstyle:CyclomaticComplexity")
Type inferIcebergType(Object value) {
if (value == null) {
return null;
} else if (value instanceof String) {
return StringType.get();
} else if (value instanceof Boolean) {
return BooleanType.get();
} else if (value instanceof BigDecimal) {
BigDecimal bigDecimal = (BigDecimal) value;
return DecimalType.of(bigDecimal.precision(), bigDecimal.scale());
} else if (value instanceof Integer || value instanceof Long) {
return LongType.get();
} else if (value instanceof Float || value instanceof Double) {
return DoubleType.get();
} else if (value instanceof LocalDate) {
return DateType.get();
} else if (value instanceof LocalTime) {
return TimeType.get();
} else if (value instanceof java.util.Date || value instanceof OffsetDateTime) {
return TimestampType.withZone();
} else if (value instanceof LocalDateTime) {
return TimestampType.withoutZone();
} else if (value instanceof List) {
List<?> list = (List<?>) value;
if (list.isEmpty()) {
return null;
}
Type elementType = inferIcebergType(list.get(0));
return elementType == null ? null : ListType.ofOptional(nextId(), elementType);
} else if (value instanceof Map) {
Map<?, ?> map = (Map<?, ?>) value;
List<NestedField> structFields =
map.entrySet().stream()
.filter(entry -> entry.getKey() != null && entry.getValue() != null)
.map(
entry -> {
Type valueType = inferIcebergType(entry.getValue());
return valueType == null
? null
: NestedField.optional(nextId(), entry.getKey().toString(), valueType);
})
.filter(Objects::nonNull)
.collect(Collectors.toList());
if (structFields.isEmpty()) {
return null;
}
return StructType.of(structFields);
} else {
return null;
}
}
private int nextId() {
return fieldId++;
}
}
private SchemaUtils() {}
}