/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.expressions.BoundReference;
import org.apache.iceberg.expressions.BoundTerm;
import org.apache.iceberg.expressions.BoundTransform;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.Term;
import org.apache.iceberg.expressions.UnboundTerm;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.transforms.PartitionSpecVisitor;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.transforms.UnknownTransform;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.util.Pair;

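/**
 * Implementation of {@link UpdatePartitionSpec}, created through {@link Table#updateSpec()}.
 *
 * <p>A minimal usage sketch (column and partition field names are illustrative; assumes a table
 * with columns {@code category} and {@code ts} and an existing partition field named {@code
 * shard}):
 *
 * <pre>{@code
 * table
 *     .updateSpec()
 *     .addField("category")                       // identity partition on "category"
 *     .addField("ts_day", Expressions.day("ts"))  // day transform on "ts"
 *     .removeField("shard")                       // drop an existing partition field
 *     .commit();                                  // apply and set the new spec as the default
 * }</pre>
 */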
class BaseUpdatePartitionSpec implements UpdatePartitionSpec {
  private final TableOperations ops;
  private final TableMetadata base;
  private final int formatVersion;
  private final PartitionSpec spec;
  private final Schema schema;
  private final Map<String, PartitionField> nameToField;
  private final Map<Pair<Integer, String>, PartitionField> transformToField;

  private final List<PartitionField> adds = Lists.newArrayList();
  private final Map<Integer, PartitionField> addedTimeFields = Maps.newHashMap();
  private final Map<Pair<Integer, String>, PartitionField> transformToAddedField =
      Maps.newHashMap();
  private final Map<String, PartitionField> nameToAddedField = Maps.newHashMap();
  private final Set<Integer> deletes = Sets.newHashSet();
  private final Map<String, String> renames = Maps.newHashMap();

  private boolean caseSensitive;
  private boolean setAsDefault;
  private int lastAssignedPartitionId;

  BaseUpdatePartitionSpec(TableOperations ops) {
    this.ops = ops;
    this.caseSensitive = true;
    this.setAsDefault = true;
    this.base = ops.current();
    this.formatVersion = base.formatVersion();
    this.spec = base.spec();
    this.schema = spec.schema();
    this.nameToField = indexSpecByName(spec);
    this.transformToField = indexSpecByTransform(spec);
    this.lastAssignedPartitionId = base.lastAssignedPartitionId();

    spec.fields().stream()
        .filter(field -> field.transform() instanceof UnknownTransform)
        .findAny()
        .ifPresent(
            field -> {
              throw new IllegalArgumentException(
                  "Cannot update partition spec with unknown transform: " + field);
            });
  }

  /** For testing only. */
  @VisibleForTesting
  BaseUpdatePartitionSpec(int formatVersion, PartitionSpec spec) {
    this(formatVersion, spec, spec.lastAssignedFieldId());
  }

  /** For testing only. */
  @VisibleForTesting
  BaseUpdatePartitionSpec(int formatVersion, PartitionSpec spec, int lastAssignedPartitionId) {
    this.ops = null;
    this.base = null;
    this.formatVersion = formatVersion;
    this.caseSensitive = true;
    this.setAsDefault = true;
    this.spec = spec;
    this.schema = spec.schema();
    this.nameToField = indexSpecByName(spec);
    this.transformToField = indexSpecByTransform(spec);
    this.lastAssignedPartitionId = lastAssignedPartitionId;
  }

  private int assignFieldId() {
    this.lastAssignedPartitionId += 1;
    return lastAssignedPartitionId;
  }

  /**
   * In v2 tables, searches historical partition specs for a matching partition field, comparing
   * the source field ID, the transform, and (when specified) the target name. If no match is
   * found, or for v1 tables, creates a new PartitionField.
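   *
   * <p>For example (ids are illustrative): in a v2 table whose earlier spec already contained a
   * day transform on {@code ts} as partition field 1001, re-adding a day transform on {@code ts}
   * reuses field id 1001 rather than assigning a new one.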
   *
   * @param sourceTransform pair of source ID and transform for this PartitionField addition
   * @param name target partition field name, if specified
   * @return the recycled or newly created partition field
   */
  private PartitionField recycleOrCreatePartitionField(
      Pair<Integer, Transform<?, ?>> sourceTransform, String name) {
    if (formatVersion >= 2 && base != null) {
      int sourceId = sourceTransform.first();
      Transform<?, ?> transform = sourceTransform.second();

      Set<PartitionField> allHistoricalFields = Sets.newHashSet();
      for (PartitionSpec partitionSpec : base.specs()) {
        allHistoricalFields.addAll(partitionSpec.fields());
      }

      for (PartitionField field : allHistoricalFields) {
        if (field.sourceId() == sourceId && field.transform().equals(transform)) {
          // if a target name is specified, the historical field's name must match it as well
          if (name == null || field.name().equals(name)) {
            return field;
          }
        }
      }
    }
    return new PartitionField(
        sourceTransform.first(), assignFieldId(), name, sourceTransform.second());
  }

  @Override
  public UpdatePartitionSpec caseSensitive(boolean isCaseSensitive) {
    this.caseSensitive = isCaseSensitive;
    return this;
  }

  @Override
  public UpdatePartitionSpec addNonDefaultSpec() {
    this.setAsDefault = false;
    return this;
  }

  @Override
  public BaseUpdatePartitionSpec addField(String sourceName) {
    return addField(Expressions.ref(sourceName));
  }

  @Override
  public BaseUpdatePartitionSpec addField(Term term) {
    return addField(null, term);
  }

  private BaseUpdatePartitionSpec rewriteDeleteAndAddField(PartitionField existing, String name) {
    deletes.remove(existing.fieldId());
    if (name == null || existing.name().equals(name)) {
      return this;
    } else {
      return renameField(existing.name(), name);
    }
  }

  @Override
  public BaseUpdatePartitionSpec addField(String name, Term term) {
    PartitionField alreadyAdded = nameToAddedField.get(name);
    Preconditions.checkArgument(
        alreadyAdded == null, "Cannot add duplicate partition field: %s", alreadyAdded);

    Pair<Integer, Transform<?, ?>> sourceTransform = resolve(term);
    Pair<Integer, String> validationKey =
        Pair.of(sourceTransform.first(), sourceTransform.second().toString());

    PartitionField existing = transformToField.get(validationKey);
    if (existing != null
        && deletes.contains(existing.fieldId())
        && existing.transform().equals(sourceTransform.second())) {
      return rewriteDeleteAndAddField(existing, name);
    }

    Preconditions.checkArgument(
        existing == null
            || (deletes.contains(existing.fieldId())
                && !existing.transform().toString().equals(sourceTransform.second().toString())),
        "Cannot add duplicate partition field %s=%s, conflicts with %s",
        name,
        term,
        existing);

    PartitionField added = transformToAddedField.get(validationKey);
    Preconditions.checkArgument(
        added == null,
        "Cannot add duplicate partition field %s=%s, already added: %s",
        name,
        term,
        added);

    PartitionField newField = recycleOrCreatePartitionField(sourceTransform, name);
    if (newField.name() == null) {
      String partitionName =
          PartitionSpecVisitor.visit(schema, newField, PartitionNameGenerator.INSTANCE);
      newField =
          new PartitionField(
              newField.sourceId(), newField.fieldId(), partitionName, newField.transform());
    }

    checkForRedundantAddedPartitions(newField);
    transformToAddedField.put(validationKey, newField);

    PartitionField existingField = nameToField.get(newField.name());
    if (existingField != null && !deletes.contains(existingField.fieldId())) {
      if (isVoidTransform(existingField)) {
        // rename the old deleted field that is being replaced by the new field
        renameField(existingField.name(), existingField.name() + "_" + existingField.fieldId());
      } else {
        throw new IllegalArgumentException(
            String.format("Cannot add duplicate partition field name: %s", name));
      }
    } else if (existingField != null && deletes.contains(existingField.fieldId())) {
      renames.put(existingField.name(), existingField.name() + "_" + existingField.fieldId());
    }

    nameToAddedField.put(newField.name(), newField);

    adds.add(newField);

    return this;
  }

  @Override
  public BaseUpdatePartitionSpec removeField(String name) {
    PartitionField alreadyAdded = nameToAddedField.get(name);
    Preconditions.checkArgument(
        alreadyAdded == null, "Cannot delete newly added field: %s", alreadyAdded);

    Preconditions.checkArgument(
        renames.get(name) == null, "Cannot rename and delete partition field: %s", name);

    PartitionField field = nameToField.get(name);
    Preconditions.checkArgument(field != null, "Cannot find partition field to remove: %s", name);

    deletes.add(field.fieldId());

    return this;
  }

  @Override
  public BaseUpdatePartitionSpec removeField(Term term) {
    Pair<Integer, Transform<?, ?>> sourceTransform = resolve(term);
    Pair<Integer, String> key =
        Pair.of(sourceTransform.first(), sourceTransform.second().toString());

    PartitionField added = transformToAddedField.get(key);
    Preconditions.checkArgument(added == null, "Cannot delete newly added field: %s", added);

    PartitionField field = transformToField.get(key);
    Preconditions.checkArgument(field != null, "Cannot find partition field to remove: %s", term);
    Preconditions.checkArgument(
        renames.get(field.name()) == null,
        "Cannot rename and delete partition field: %s",
        field.name());

    deletes.add(field.fieldId());

    return this;
  }

  @Override
  public BaseUpdatePartitionSpec renameField(String name, String newName) {
    PartitionField existingField = nameToField.get(newName);
    if (existingField != null && isVoidTransform(existingField)) {
      // rename the old deleted field that is being replaced by the new field
      renameField(existingField.name(), existingField.name() + "_" + existingField.fieldId());
    }

    PartitionField added = nameToAddedField.get(name);
    Preconditions.checkArgument(
        added == null, "Cannot rename newly added partition field: %s", name);

    PartitionField field = nameToField.get(name);
    Preconditions.checkArgument(field != null, "Cannot find partition field to rename: %s", name);
    Preconditions.checkArgument(
        !deletes.contains(field.fieldId()), "Cannot delete and rename partition field: %s", name);

    renames.put(name, newName);

    return this;
  }

  @Override
  public PartitionSpec apply() {
    PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);

    for (PartitionField field : spec.fields()) {
      if (!deletes.contains(field.fieldId())) {
        String newName = renames.get(field.name());
        if (newName != null) {
          builder.add(field.sourceId(), field.fieldId(), newName, field.transform());
        } else {
          builder.add(field.sourceId(), field.fieldId(), field.name(), field.transform());
        }
      } else if (formatVersion < 2) {
        // field IDs were not required for v1 and were assigned sequentially in each partition spec
        // starting at 1,000. to maintain consistent field ids across partition specs in v1 tables,
        // any partition field that is removed must be replaced with a null transform. null values
        // are always allowed in partition data.
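        // e.g. removing a v1 field named "shard_bucket_16" (name illustrative) keeps the field in
        // the spec with its id and name but rewrites its transform to void, so partition tuples
        // keep the same structure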
        String newName = renames.get(field.name());
        if (newName != null) {
          builder.add(field.sourceId(), field.fieldId(), newName, Transforms.alwaysNull());
        } else {
          builder.add(field.sourceId(), field.fieldId(), field.name(), Transforms.alwaysNull());
        }
      }
    }

    for (PartitionField newField : adds) {
      builder.add(newField.sourceId(), newField.fieldId(), newField.name(), newField.transform());
    }

    return builder.build();
  }

  @Override
  public void commit() {
    TableMetadata update;
    if (setAsDefault) {
      update = base.updatePartitionSpec(apply());
    } else {
      update = base.addPartitionSpec(apply());
    }
    ops.commit(base, update);
  }

  private Pair<Integer, Transform<?, ?>> resolve(Term term) {
    Preconditions.checkArgument(term instanceof UnboundTerm, "Term must be unbound");

    BoundTerm<?> boundTerm = ((UnboundTerm<?>) term).bind(schema.asStruct(), caseSensitive);
    int sourceId = boundTerm.ref().fieldId();
    Transform<?, ?> transform = toTransform(boundTerm);

    Type fieldType = schema.findType(sourceId);
    if (fieldType != null) {
      transform = Transforms.fromString(fieldType, transform.toString());
    } else {
      transform = Transforms.fromString(transform.toString());
    }
    return Pair.of(sourceId, transform);
  }

  private Transform<?, ?> toTransform(BoundTerm<?> term) {
    if (term instanceof BoundReference) {
      return Transforms.identity();
    } else if (term instanceof BoundTransform) {
      return ((BoundTransform<?, ?>) term).transform();
    } else {
      throw new ValidationException(
          "Invalid term: %s, expected either a bound reference or transform", term);
    }
  }

  private void checkForRedundantAddedPartitions(PartitionField field) {
    if (isTimeTransform(field)) {
      PartitionField timeField = addedTimeFields.get(field.sourceId());
      Preconditions.checkArgument(
          timeField == null,
          "Cannot add redundant partition field: %s conflicts with %s",
          timeField,
          field);
      addedTimeFields.put(field.sourceId(), field);
    }
  }

  private static Map<String, PartitionField> indexSpecByName(PartitionSpec spec) {
    ImmutableMap.Builder<String, PartitionField> builder = ImmutableMap.builder();
    List<PartitionField> fields = spec.fields();
    for (PartitionField field : fields) {
      builder.put(field.name(), field);
    }

    return builder.build();
  }

  private static Map<Pair<Integer, String>, PartitionField> indexSpecByTransform(
      PartitionSpec spec) {
    Map<Pair<Integer, String>, PartitionField> indexSpecs = Maps.newHashMap();
    List<PartitionField> fields = spec.fields();
    for (PartitionField field : fields) {
      indexSpecs.put(Pair.of(field.sourceId(), field.transform().toString()), field);
    }

    return indexSpecs;
  }

  private boolean isTimeTransform(PartitionField field) {
    return PartitionSpecVisitor.visit(schema, field, IsTimeTransform.INSTANCE);
  }

  private static class IsTimeTransform implements PartitionSpecVisitor<Boolean> {
    private static final IsTimeTransform INSTANCE = new IsTimeTransform();

    private IsTimeTransform() {}

    @Override
    public Boolean identity(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean bucket(int fieldId, String sourceName, int sourceId, int numBuckets) {
      return false;
    }

    @Override
    public Boolean truncate(int fieldId, String sourceName, int sourceId, int width) {
      return false;
    }

    @Override
    public Boolean year(int fieldId, String sourceName, int sourceId) {
      return true;
    }

    @Override
    public Boolean month(int fieldId, String sourceName, int sourceId) {
      return true;
    }

    @Override
    public Boolean day(int fieldId, String sourceName, int sourceId) {
      return true;
    }

    @Override
    public Boolean hour(int fieldId, String sourceName, int sourceId) {
      return true;
    }

    @Override
    public Boolean alwaysNull(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean unknown(int fieldId, String sourceName, int sourceId, String transform) {
      return false;
    }
  }

  private boolean isVoidTransform(PartitionField field) {
    return PartitionSpecVisitor.visit(schema, field, IsVoidTransform.INSTANCE);
  }

  private static class IsVoidTransform implements PartitionSpecVisitor<Boolean> {
    private static final IsVoidTransform INSTANCE = new IsVoidTransform();

    private IsVoidTransform() {}

    @Override
    public Boolean identity(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean bucket(int fieldId, String sourceName, int sourceId, int numBuckets) {
      return false;
    }

    @Override
    public Boolean truncate(int fieldId, String sourceName, int sourceId, int width) {
      return false;
    }

    @Override
    public Boolean year(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean month(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean day(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean hour(int fieldId, String sourceName, int sourceId) {
      return false;
    }

    @Override
    public Boolean alwaysNull(int fieldId, String sourceName, int sourceId) {
      return true;
    }

    @Override
    public Boolean unknown(int fieldId, String sourceName, int sourceId, String transform) {
      return false;
    }
  }

  private static class PartitionNameGenerator implements PartitionSpecVisitor<String> {
    private static final PartitionNameGenerator INSTANCE = new PartitionNameGenerator();

    private PartitionNameGenerator() {}

    @Override
    public String identity(int fieldId, String sourceName, int sourceId) {
      return sourceName;
    }

    @Override
    public String bucket(int fieldId, String sourceName, int sourceId, int numBuckets) {
      return sourceName + "_bucket_" + numBuckets;
    }

    @Override
    public String truncate(int fieldId, String sourceName, int sourceId, int width) {
      return sourceName + "_trunc_" + width;
    }

    @Override
    public String year(int fieldId, String sourceName, int sourceId) {
      return sourceName + "_year";
    }

    @Override
    public String month(int fieldId, String sourceName, int sourceId) {
      return sourceName + "_month";
    }

    @Override
    public String day(int fieldId, String sourceName, int sourceId) {
      return sourceName + "_day";
    }

    @Override
    public String hour(int fieldId, String sourceName, int sourceId) {
      return sourceName + "_hour";
    }

    @Override
    public String alwaysNull(int fieldId, String sourceName, int sourceId) {
      return sourceName + "_null";
    }
  }
}