All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.Partitioning Maven / Gradle / Ivy

There is a newer version: 1.6.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.transforms.PartitionSpecVisitor;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.transforms.UnknownTransform;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.iceberg.types.Types.StructType;

public class Partitioning {
  private Partitioning() {}

  /**
   * Check whether the spec contains a bucketed partition field.
   *
   * @param spec a partition spec
   * @return true if the spec has field with a bucket transform
   */
  public static boolean hasBucketField(PartitionSpec spec) {
    List bucketList =
        PartitionSpecVisitor.visit(
            spec,
            new PartitionSpecVisitor() {
              @Override
              public Boolean identity(int fieldId, String sourceName, int sourceId) {
                return false;
              }

              @Override
              public Boolean bucket(int fieldId, String sourceName, int sourceId, int width) {
                return true;
              }

              @Override
              public Boolean truncate(int fieldId, String sourceName, int sourceId, int width) {
                return false;
              }

              @Override
              public Boolean year(int fieldId, String sourceName, int sourceId) {
                return false;
              }

              @Override
              public Boolean month(int fieldId, String sourceName, int sourceId) {
                return false;
              }

              @Override
              public Boolean day(int fieldId, String sourceName, int sourceId) {
                return false;
              }

              @Override
              public Boolean hour(int fieldId, String sourceName, int sourceId) {
                return false;
              }

              @Override
              public Boolean alwaysNull(int fieldId, String sourceName, int sourceId) {
                return false;
              }

              @Override
              public Boolean unknown(
                  int fieldId, String sourceName, int sourceId, String transform) {
                return false;
              }
            });

    return bucketList.stream().anyMatch(Boolean::booleanValue);
  }

  /**
   * Create a sort order that will group data for a partition spec.
   *
   * 

If the partition spec contains bucket columns, the sort order will also have a field to sort * by a column that is bucketed in the spec. The column is selected by the highest number of * buckets in the transform. * * @param spec a partition spec * @return a sort order that will cluster data for the spec */ public static SortOrder sortOrderFor(PartitionSpec spec) { if (spec.isUnpartitioned()) { return SortOrder.unsorted(); } SortOrder.Builder builder = SortOrder.builderFor(spec.schema()); SpecToOrderVisitor converter = new SpecToOrderVisitor(builder); PartitionSpecVisitor.visit(spec, converter); // columns used for bucketing are high cardinality; add one to the sort at the end String bucketColumn = converter.bucketColumn(); if (bucketColumn != null) { builder.asc(bucketColumn); } return builder.build(); } private static class SpecToOrderVisitor implements PartitionSpecVisitor { private final SortOrder.Builder builder; private String bucketColumn = null; private int highestNumBuckets = 0; private SpecToOrderVisitor(SortOrder.Builder builder) { this.builder = builder; } String bucketColumn() { return bucketColumn; } @Override public Void identity(int fieldId, String sourceName, int sourceId) { builder.asc(sourceName); return null; } @Override public Void bucket(int fieldId, String sourceName, int sourceId, int numBuckets) { // the column with highest cardinality is usually the one with the highest number of buckets if (numBuckets > highestNumBuckets) { this.highestNumBuckets = numBuckets; this.bucketColumn = sourceName; } builder.asc(Expressions.bucket(sourceName, numBuckets)); return null; } @Override public Void truncate(int fieldId, String sourceName, int sourceId, int width) { builder.asc(Expressions.truncate(sourceName, width)); return null; } @Override public Void year(int fieldId, String sourceName, int sourceId) { builder.asc(Expressions.year(sourceName)); return null; } @Override public Void month(int fieldId, String sourceName, int sourceId) { builder.asc(Expressions.month(sourceName)); return null; } @Override public Void day(int fieldId, String sourceName, int sourceId) { builder.asc(Expressions.day(sourceName)); return null; } @Override public Void hour(int fieldId, String sourceName, int sourceId) { builder.asc(Expressions.hour(sourceName)); return null; } @Override public Void alwaysNull(int fieldId, String sourceName, int sourceId) { // do nothing for alwaysNull, it doesn't need to be added to the sort return null; } } /** * Builds a grouping key type considering the provided schema and specs. * *

A grouping key defines how data is split between files and consists of partition fields with * non-void transforms that are present in each provided spec. Iceberg guarantees that records * with different values for the grouping key are disjoint and are stored in separate files. * *

If there is only one spec, the grouping key will include all partition fields with non-void * transforms from that spec. Whenever there are multiple specs, the grouping key will represent * an intersection of all partition fields with non-void transforms. If a partition field is * present only in a subset of specs, Iceberg cannot guarantee data distribution on that field. * That's why it will not be part of the grouping key. Unpartitioned tables or tables with * non-overlapping specs have empty grouping keys. * *

When partition fields are dropped in v1 tables, they are replaced with new partition fields * that have the same field ID but use a void transform under the hood. Such fields cannot be part * of the grouping key as void transforms always return null. * *

If the provided schema is not null, this method will only take into account partition fields * on top of columns present in the schema. Otherwise, all partition fields will be considered. * * @param schema a schema specifying a set of source columns to consider (null to consider all) * @param specs one or many specs * @return the constructed grouping key type */ public static StructType groupingKeyType(Schema schema, Collection specs) { return buildPartitionProjectionType("grouping key", specs, commonActiveFieldIds(schema, specs)); } /** * Builds a unified partition type considering all specs in a table. * *

If there is only one spec, the partition type is that spec's partition type. Whenever there * are multiple specs, the partition type is a struct containing all fields that have ever been a * part of any spec in the table. In other words, the struct fields represent a union of all known * partition fields. * * @param table a table with one or many specs * @return the constructed unified partition type */ public static StructType partitionType(Table table) { Collection specs = table.specs().values(); return buildPartitionProjectionType("table partition", specs, allFieldIds(specs)); } private static StructType buildPartitionProjectionType( String typeName, Collection specs, Set projectedFieldIds) { // we currently don't know the output type of unknown transforms List> unknownTransforms = collectUnknownTransforms(specs); ValidationException.check( unknownTransforms.isEmpty(), "Cannot build %s type, unknown transforms: %s", typeName, unknownTransforms); Map fieldMap = Maps.newHashMap(); Map typeMap = Maps.newHashMap(); Map nameMap = Maps.newHashMap(); // sort specs by ID in descending order to pick up the most recent field names List sortedSpecs = specs.stream() .sorted(Comparator.comparingLong(PartitionSpec::specId).reversed()) .collect(Collectors.toList()); for (PartitionSpec spec : sortedSpecs) { for (PartitionField field : spec.fields()) { int fieldId = field.fieldId(); if (!projectedFieldIds.contains(fieldId)) { continue; } NestedField structField = spec.partitionType().field(fieldId); PartitionField existingField = fieldMap.get(fieldId); if (existingField == null) { fieldMap.put(fieldId, field); typeMap.put(fieldId, structField.type()); nameMap.put(fieldId, structField.name()); } else { // verify the fields are compatible as they may conflict in v1 tables ValidationException.check( equivalentIgnoringNames(field, existingField), "Conflicting partition fields: ['%s', '%s']", field, existingField); // use the correct type for dropped partitions in v1 tables if (isVoidTransform(existingField) && !isVoidTransform(field)) { fieldMap.put(fieldId, field); typeMap.put(fieldId, structField.type()); } } } } List sortedStructFields = fieldMap.keySet().stream() .sorted(Comparator.naturalOrder()) .map( fieldId -> NestedField.optional(fieldId, nameMap.get(fieldId), typeMap.get(fieldId))) .collect(Collectors.toList()); return StructType.of(sortedStructFields); } private static boolean isVoidTransform(PartitionField field) { return field.transform().equals(Transforms.alwaysNull()); } private static List> collectUnknownTransforms(Collection specs) { List> unknownTransforms = Lists.newArrayList(); for (PartitionSpec spec : specs) { spec.fields().stream() .map(PartitionField::transform) .filter(transform -> transform instanceof UnknownTransform) .forEach(unknownTransforms::add); } return unknownTransforms; } private static boolean equivalentIgnoringNames( PartitionField field, PartitionField anotherField) { return field.fieldId() == anotherField.fieldId() && field.sourceId() == anotherField.sourceId() && compatibleTransforms(field.transform(), anotherField.transform()); } private static boolean compatibleTransforms(Transform t1, Transform t2) { return t1.equals(t2) || t1.equals(Transforms.alwaysNull()) || t2.equals(Transforms.alwaysNull()); } // collects IDs of all partition field used across specs private static Set allFieldIds(Collection specs) { return FluentIterable.from(specs) .transformAndConcat(PartitionSpec::fields) .transform(PartitionField::fieldId) .toSet(); } // collects IDs of partition fields with non-void transforms that are present in each spec private static Set commonActiveFieldIds(Schema schema, Collection specs) { Set commonActiveFieldIds = Sets.newHashSet(); int specIndex = 0; for (PartitionSpec spec : specs) { if (specIndex == 0) { commonActiveFieldIds.addAll(activeFieldIds(schema, spec)); } else { commonActiveFieldIds.retainAll(activeFieldIds(schema, spec)); } specIndex++; } return commonActiveFieldIds; } private static List activeFieldIds(Schema schema, PartitionSpec spec) { return spec.fields().stream() .filter(field -> schema == null || schema.findField(field.sourceId()) != null) .filter(field -> !isVoidTransform(field)) .map(PartitionField::fieldId) .collect(Collectors.toList()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy