All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.orc.ORCSchemaUtil Maven / Gradle / Ivy

There is a newer version: 1.0.0.5
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.orc;

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.iceberg.Schema;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMultimap;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.orc.TypeDescription;


/**
 * Utilities for mapping Iceberg to ORC schemas.
 */
public final class ORCSchemaUtil {

  /**
   * Iceberg types that are written as an ORC binary column. The constant name is stored as the value of the
   * {@code iceberg.binary-type} attribute on the ORC type.
   */
  public enum BinaryType {
    UUID,
    FIXED,
    BINARY
  }

  /**
   * Iceberg types that are written as an ORC long column. The constant name is stored as the value of the
   * {@code iceberg.long-type} attribute on the ORC type.
   */
  public enum LongType {
    TIME,
    LONG
  }

  /**
   * Immutable pairing of an ORC column's name with its ORC type, used as the value of the
   * Iceberg-field-ID to ORC-column lookup built from a file schema.
   */
  private static class OrcField {
    private final String name;
    private final TypeDescription type;

    /**
     * @param fieldName the column name as it appears in the ORC schema
     * @param fieldType the ORC type of that column
     */
    OrcField(String fieldName, TypeDescription fieldType) {
      this.name = fieldName;
      this.type = fieldType;
    }

    /** Returns the ORC column name. */
    public String name() {
      return this.name;
    }

    /** Returns the ORC type of the column. */
    public TypeDescription type() {
      return this.type;
    }
  }

  /**
   * The name of the ORC {@link TypeDescription} attribute holding the Iceberg field ID of a column,
   * stored as a decimal string.
   */
  static final String ICEBERG_ID_ATTRIBUTE = "iceberg.id";
  /**
   * The name of the ORC {@link TypeDescription} attribute recording whether the Iceberg field is required,
   * stored as {@code "true"} or {@code "false"}.
   */
  static final String ICEBERG_REQUIRED_ATTRIBUTE = "iceberg.required";

  /**
   * The name of the ORC {@link TypeDescription} attribute indicating the Iceberg type corresponding to an
   * ORC binary type. The values for this attribute are denoted in {@link BinaryType}.
   */
  public static final String ICEBERG_BINARY_TYPE_ATTRIBUTE = "iceberg.binary-type";
  /**
   * The name of the ORC {@link TypeDescription} attribute indicating the Iceberg type corresponding to an
   * ORC long type. The values for this attribute are denoted in {@link LongType}.
   */
  public static final String ICEBERG_LONG_TYPE_ATTRIBUTE = "iceberg.long-type";
  /**
   * The name of the ORC {@link TypeDescription} attribute holding the byte length of an Iceberg fixed type,
   * stored as a decimal string.
   */
  static final String ICEBERG_FIELD_LENGTH = "iceberg.length";

  /**
   * ORC categories that are accepted as the same type as each Iceberg primitive type ID when projecting
   * onto an existing file schema. Timestamps are absent on purpose: {@code isSameType} decides them from
   * the Iceberg type's adjust-to-UTC flag instead of this table.
   */
  private static final ImmutableMultimap<Type.TypeID, TypeDescription.Category> TYPE_MAPPING =
      ImmutableMultimap.<Type.TypeID, TypeDescription.Category>builder()
          .put(Type.TypeID.BOOLEAN, TypeDescription.Category.BOOLEAN)
          .put(Type.TypeID.INTEGER, TypeDescription.Category.BYTE)
          .put(Type.TypeID.INTEGER, TypeDescription.Category.SHORT)
          .put(Type.TypeID.INTEGER, TypeDescription.Category.INT)
          .put(Type.TypeID.LONG, TypeDescription.Category.LONG)
          .put(Type.TypeID.TIME, TypeDescription.Category.LONG)
          .put(Type.TypeID.FLOAT, TypeDescription.Category.FLOAT)
          .put(Type.TypeID.DOUBLE, TypeDescription.Category.DOUBLE)
          .put(Type.TypeID.DATE, TypeDescription.Category.DATE)
          .put(Type.TypeID.STRING, TypeDescription.Category.CHAR)
          .put(Type.TypeID.STRING, TypeDescription.Category.VARCHAR)
          .put(Type.TypeID.STRING, TypeDescription.Category.STRING)
          .put(Type.TypeID.UUID, TypeDescription.Category.BINARY)
          .put(Type.TypeID.FIXED, TypeDescription.Category.BINARY)
          .put(Type.TypeID.BINARY, TypeDescription.Category.BINARY)
          .put(Type.TypeID.DECIMAL, TypeDescription.Category.DECIMAL)
          .build();

  // Static utility holder: the private constructor prevents instantiation.
  private ORCSchemaUtil() {
  }

  /**
   * Converts an Iceberg schema to an ORC struct type. Every top-level Iceberg column becomes a field of the
   * returned struct, carrying its Iceberg field ID and required flag as ORC attributes; the root struct
   * itself is not given any Iceberg attributes.
   *
   * @param schema the Iceberg schema to convert
   * @return the ORC root struct describing the same columns
   */
  public static TypeDescription convert(Schema schema) {
    final TypeDescription orcRoot = TypeDescription.createStruct();
    schema.asStruct().fields().forEach(column ->
        orcRoot.addField(column.name(), convert(column.fieldId(), column.type(), column.isRequired())));
    return orcRoot;
  }

  /**
   * Converts one Iceberg type to the equivalent ORC type, recursing into struct, list and map children,
   * and stamps the result with the Iceberg field ID and required flag as ORC attributes.
   *
   * @param fieldId the Iceberg field ID, recorded under {@code iceberg.id}
   * @param type the Iceberg type to convert
   * @param isRequired whether the field is required, recorded under {@code iceberg.required}
   * @return a new ORC type carrying the Iceberg mapping attributes
   * @throws IllegalArgumentException if the Iceberg type ID is not handled
   */
  private static TypeDescription convert(Integer fieldId, Type type, boolean isRequired) {
    final TypeDescription converted = newOrcTypeFor(type);

    // Set Iceberg column attributes for mapping
    converted.setAttribute(ICEBERG_ID_ATTRIBUTE, String.valueOf(fieldId));
    converted.setAttribute(ICEBERG_REQUIRED_ATTRIBUTE, String.valueOf(isRequired));
    return converted;
  }

  /** Creates the ORC type for an Iceberg type, without the field ID and required attributes. */
  private static TypeDescription newOrcTypeFor(Type type) {
    switch (type.typeId()) {
      case BOOLEAN:
        return TypeDescription.createBoolean();
      case INTEGER:
        return TypeDescription.createInt();
      case TIME:
        return newTaggedLong(LongType.TIME);
      case LONG:
        return newTaggedLong(LongType.LONG);
      case FLOAT:
        return TypeDescription.createFloat();
      case DOUBLE:
        return TypeDescription.createDouble();
      case DATE:
        return TypeDescription.createDate();
      case TIMESTAMP:
        // Timestamps adjusted to UTC map to ORC's instant flavour, the others to the plain timestamp.
        return ((Types.TimestampType) type).shouldAdjustToUTC() ?
            TypeDescription.createTimestampInstant() :
            TypeDescription.createTimestamp();
      case STRING:
        return TypeDescription.createString();
      case UUID:
        return newTaggedBinary(BinaryType.UUID);
      case FIXED: {
        TypeDescription fixed = newTaggedBinary(BinaryType.FIXED);
        fixed.setAttribute(ICEBERG_FIELD_LENGTH, Integer.toString(((Types.FixedType) type).length()));
        return fixed;
      }
      case BINARY:
        return newTaggedBinary(BinaryType.BINARY);
      case DECIMAL: {
        Types.DecimalType decimalType = (Types.DecimalType) type;
        return TypeDescription.createDecimal()
            .withScale(decimalType.scale())
            .withPrecision(decimalType.precision());
      }
      case STRUCT: {
        TypeDescription struct = TypeDescription.createStruct();
        for (Types.NestedField child : type.asStructType().fields()) {
          struct.addField(child.name(), convert(child.fieldId(), child.type(), child.isRequired()));
        }
        return struct;
      }
      case LIST: {
        Types.ListType listType = (Types.ListType) type;
        return TypeDescription.createList(
            convert(listType.elementId(), listType.elementType(), listType.isElementRequired()));
      }
      case MAP: {
        Types.MapType mapType = (Types.MapType) type;
        // Map keys are always converted as required.
        TypeDescription orcKey = convert(mapType.keyId(), mapType.keyType(), true);
        TypeDescription orcValue = convert(mapType.valueId(), mapType.valueType(), mapType.isValueRequired());
        return TypeDescription.createMap(orcKey, orcValue);
      }
      default:
        throw new IllegalArgumentException("Unhandled type " + type.typeId());
    }
  }

  /** Creates an ORC long tagged with the Iceberg type it stands for. */
  private static TypeDescription newTaggedLong(LongType longType) {
    TypeDescription orcLong = TypeDescription.createLong();
    orcLong.setAttribute(ICEBERG_LONG_TYPE_ATTRIBUTE, longType.toString());
    return orcLong;
  }

  /** Creates an ORC binary tagged with the Iceberg type it stands for. */
  private static TypeDescription newTaggedBinary(BinaryType binaryType) {
    TypeDescription orcBinary = TypeDescription.createBinary();
    orcBinary.setAttribute(ICEBERG_BINARY_TYPE_ATTRIBUTE, binaryType.toString());
    return orcBinary;
  }

  /**
   * Convert an ORC schema to an Iceberg schema. This method handles the conversion from the original
   * Iceberg column mapping IDs if present in the ORC column attributes, otherwise, ORC columns with no
   * Iceberg IDs will be ignored and skipped in the conversion.
   *
   * @param orcSchema the ORC schema to convert
   * @return the Iceberg schema
   * @throws IllegalStateException if the number of child types and child names of the ORC schema differ
   * @throws IllegalArgumentException if ORC schema has no columns with Iceberg ID attributes
   */
  public static Schema convert(TypeDescription orcSchema) {
    List<TypeDescription> children = orcSchema.getChildren();
    List<String> childrenNames = orcSchema.getFieldNames();
    Preconditions.checkState(children.size() == childrenNames.size(),
        "Error in ORC file, children fields and names do not match.");

    OrcToIcebergVisitor schemaConverter = new OrcToIcebergVisitor();
    // The visitor yields an empty Optional for columns it does not convert; keep only the converted ones.
    List<Types.NestedField> fields = OrcToIcebergVisitor.visitSchema(orcSchema, schemaConverter).stream()
        .filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());

    if (fields.isEmpty()) {
      throw new IllegalArgumentException("ORC schema does not contain Iceberg IDs");
    }

    return new Schema(fields);
  }

  /**
   * Converts an Iceberg schema to a corresponding ORC schema within the context of an existing
   * ORC file schema.
   * This method also handles schema evolution from the original ORC file schema
   * to the given Iceberg schema. It builds the desired reader schema with the schema
   * evolution rules and passes that down to the ORC reader,
   * which would then use its schema evolution to map that to the writer's schema.
   *
   * <p>Example:
   * <pre>{@code
   * Iceberg writer                                        ORC writer
   * struct<a (1): int, b (2): string>                     struct<a: int, b: string>
   * struct<a (1): struct<b (2): string, c (3): date>>     struct<a: struct<b:string, c:date>>
   * }</pre>
   *
   * <pre>{@code
   * Iceberg reader                                        ORC reader
   * struct<a (2): string, c (3): date>                    struct<b: string, c: date>
   * struct<aa (1): struct<cc (3): date, bb (2): string>>  struct<a: struct<c:date, b:string>>
   * }</pre>
   *
   * @param schema an Iceberg schema
   * @param originalOrcSchema an existing ORC file schema
   * @return the resulting ORC schema
   * @throws IllegalArgumentException if a required Iceberg field is missing from the ORC file schema, or if
   *     a file column's type can neither be promoted to nor matched with the requested Iceberg type
   */
  public static TypeDescription buildOrcProjection(Schema schema,
                                                   TypeDescription originalOrcSchema) {
    final Map<Integer, OrcField> icebergToOrc = icebergToOrcMapping("root", originalOrcSchema);
    // The root struct has no Iceberg field ID of its own; Integer.MIN_VALUE is used as its placeholder ID.
    return buildOrcProjection(Integer.MIN_VALUE, schema.asStruct(), true, icebergToOrc);
  }

  /**
   * Builds the ORC read type for one Iceberg field, using the file's column for that field ID when present.
   *
   * <p>Structs, lists and maps are delegated to their dedicated builders. For any other type:
   * <ul>
   *   <li>if the file has a column with this ID and it is a union, the union must have exactly one child
   *       and is cloned as is;</li>
   *   <li>if the file has a column with this ID, the promoted type is used when a promotion applies,
   *       otherwise the file type is cloned after checking it matches the Iceberg type;</li>
   *   <li>if the file has no such column, an optional field is converted from the Iceberg type and a
   *       required field is rejected.</li>
   * </ul>
   * The returned type always carries {@code fieldId} in its {@code iceberg.id} attribute.
   *
   * @param fieldId the Iceberg field ID being projected
   * @param type the Iceberg type requested by the reader
   * @param isRequired whether the field, taking its ancestors into account, is required
   * @param mapping Iceberg field ID to ORC column lookup for the file schema
   * @return the ORC type to read this field with
   * @throws IllegalArgumentException if a required field is missing from the file, or if the file type can
   *     neither be promoted to nor matched with the Iceberg type
   * @throws IllegalStateException if the file column is a union with more than one child
   */
  private static TypeDescription buildOrcProjection(Integer fieldId, Type type, boolean isRequired,
                                                    Map<Integer, OrcField> mapping) {
    final TypeDescription orcType;
    // Single lookup; the mapping never stores null values, so null means "not present in the file".
    final OrcField orcField = mapping.get(fieldId);

    switch (type.typeId()) {
      case STRUCT:
        orcType = buildOrcProjectForStructType(fieldId, type, isRequired, mapping);
        break;
      case LIST:
        orcType = buildOrcProjectionForListType((Types.ListType) type, isRequired, mapping, orcField);
        break;
      case MAP:
        orcType = buildOrcProjectionForMapType((Types.MapType) type, isRequired, mapping, orcField);
        break;
      default:
        if (orcField != null) {
          TypeDescription originalType = orcField.type();
          if (originalType != null && originalType.getCategory().equals(TypeDescription.Category.UNION)) {
            Preconditions.checkState(originalType.getChildren().size() == 1,
                "Expect single type union for orc schema.");
            orcType = originalType.clone();
          } else {
            Optional<TypeDescription> promotedType = getPromotedType(type, originalType);

            if (promotedType.isPresent()) {
              orcType = promotedType.get();
            } else {
              Preconditions.checkArgument(isSameType(originalType, type),
                  "Can not promote %s type to %s",
                  originalType.getCategory(), type.typeId().name());
              orcType = originalType.clone();
            }
          }
        } else {
          if (isRequired) {
            throw new IllegalArgumentException(
                String.format("Field %d of type %s is required and was not found.", fieldId, type.toString()));
          }

          orcType = convert(fieldId, type, false);
        }
    }
    orcType.setAttribute(ICEBERG_ID_ATTRIBUTE, fieldId.toString());
    return orcType;
  }

  /**
   * Builds the ORC read type for an Iceberg map. Key and value are projected recursively; when the file
   * column for this field is a union (which must have exactly one child), the resulting map is wrapped in a
   * single-child union.
   *
   * @param type the Iceberg map type requested by the reader
   * @param isRequired whether the map field, taking its ancestors into account, is required
   * @param mapping Iceberg field ID to ORC column lookup for the file schema
   * @param orcField the file column for this field, or null if the file has none
   * @return the ORC map type, or a union containing it
   * @throws IllegalStateException if the file column is a union with more than one child
   */
  private static TypeDescription buildOrcProjectionForMapType(Types.MapType type, boolean isRequired,
      Map<Integer, OrcField> mapping, OrcField orcField) {
    final boolean fileColumnIsUnion =
        orcField != null && orcField.type().getCategory().equals(TypeDescription.Category.UNION);
    if (fileColumnIsUnion) {
      // Validate the file column before projecting any children.
      Preconditions.checkState(orcField.type().getChildren().size() == 1,
          "Expect single type union for orc schema.");
    }

    TypeDescription keyType = buildOrcProjection(type.keyId(), type.keyType(), isRequired, mapping);
    TypeDescription valueType = buildOrcProjection(type.valueId(), type.valueType(),
        isRequired && type.isValueRequired(), mapping);
    TypeDescription mapType = TypeDescription.createMap(keyType, valueType);
    if (!fileColumnIsUnion) {
      return mapType;
    }

    TypeDescription unionType = TypeDescription.createUnion();
    unionType.addUnionChild(mapType);
    return unionType;
  }

  /**
   * Builds the ORC read type for an Iceberg list. The element is projected recursively; when the file
   * column for this field is a union (which must have exactly one child), the resulting list is wrapped in
   * a single-child union.
   *
   * @param type the Iceberg list type requested by the reader
   * @param isRequired whether the list field, taking its ancestors into account, is required
   * @param mapping Iceberg field ID to ORC column lookup for the file schema
   * @param orcField the file column for this field, or null if the file has none
   * @return the ORC list type, or a union containing it
   * @throws IllegalStateException if the file column is a union with more than one child
   */
  private static TypeDescription buildOrcProjectionForListType(Types.ListType type, boolean isRequired,
      Map<Integer, OrcField> mapping, OrcField orcField) {
    final boolean fileColumnIsUnion =
        orcField != null && orcField.type().getCategory().equals(TypeDescription.Category.UNION);
    if (fileColumnIsUnion) {
      // Validate the file column before projecting the element.
      Preconditions.checkState(orcField.type().getChildren().size() == 1,
          "Expect single type union for orc schema.");
    }

    TypeDescription elementType = buildOrcProjection(type.elementId(), type.elementType(),
        isRequired && type.isElementRequired(), mapping);
    TypeDescription listType = TypeDescription.createList(elementType);
    if (!fileColumnIsUnion) {
      return listType;
    }

    TypeDescription unionType = TypeDescription.createUnion();
    unionType.addUnionChild(listType);
    return unionType;
  }

  /**
   * Builds the ORC read type for an Iceberg struct. When the file column for this field is a union the work
   * is delegated to the union builder; otherwise an ORC struct is assembled field by field.
   *
   * @param fieldId the Iceberg field ID of the struct
   * @param type the Iceberg struct type requested by the reader
   * @param isRequired whether the struct field, taking its ancestors into account, is required
   * @param mapping Iceberg field ID to ORC column lookup for the file schema
   * @return the ORC struct (or union) type to read this field with
   */
  private static TypeDescription buildOrcProjectForStructType(Integer fieldId, Type type, boolean isRequired,
      Map<Integer, OrcField> mapping) {
    TypeDescription orcType;
    OrcField orcField = mapping.get(fieldId);

    if (orcField != null && orcField.type().getCategory().equals(TypeDescription.Category.UNION)) {
      // this branch means the iceberg struct schema actually corresponds to an underlying union
      orcType = getOrcSchemaForUnionType(type, isRequired, mapping, orcField);
    } else {
      orcType = TypeDescription.createStruct();
      for (Types.NestedField nestedField : type.asStructType().fields()) {
        OrcField fileColumn = mapping.get(nestedField.fieldId());
        if (fileColumn == null && nestedField.hasDefaultValue()) {
          // Absent from the file but has a default value: leave it out of the ORC read schema.
          // NOTE(review): presumably the default is filled in by the reader - confirm.
          continue;
        }
        // Using suffix _r to avoid potential underlying issues in ORC reader
        // with reused column names between ORC and Iceberg;
        // e.g. renaming column c -> d and adding new column d
        String name = Optional.ofNullable(fileColumn)
            .map(OrcField::name)
            .orElseGet(() -> nestedField.name() + "_r" + nestedField.fieldId());
        TypeDescription childType = buildOrcProjection(nestedField.fieldId(), nestedField.type(),
            isRequired && nestedField.isRequired(), mapping);
        orcType.addField(name, childType);
      }
    }
    return orcType;
  }

  /**
   * Builds the ORC union read type for an Iceberg struct whose file column is a union.
   *
   * <p>For a union with a single child, the Iceberg struct's fields are projected into one ORC struct that
   * becomes the only union child. For a union with several children, every Iceberg field except the first
   * is projected and added as a union child.
   *
   * @param type the Iceberg struct type requested by the reader
   * @param isRequired whether the field, taking its ancestors into account, is required
   * @param mapping Iceberg field ID to ORC column lookup for the file schema
   * @param orcField the file column for this field; its type is a union
   * @return the ORC union type to read this field with
   */
  private static TypeDescription getOrcSchemaForUnionType(Type type, boolean isRequired,
      Map<Integer, OrcField> mapping, OrcField orcField) {
    final boolean singleTypeUnion = orcField.type().getChildren().size() == 1;
    final TypeDescription orcType = TypeDescription.createUnion();
    final List<Types.NestedField> nestedFields = type.asStructType().fields();

    if (singleTypeUnion) {
      TypeDescription childOrcStructType = TypeDescription.createStruct();
      for (Types.NestedField nestedField : nestedFields) {
        OrcField fileColumn = mapping.get(nestedField.fieldId());
        if (fileColumn == null && nestedField.hasDefaultValue()) {
          // Absent from the file but has a default value: leave it out of the ORC read schema.
          continue;
        }
        // Unlike the plain struct path, a field missing from the file keeps its Iceberg name unchanged.
        String name = Optional.ofNullable(fileColumn)
            .map(OrcField::name)
            .orElseGet(nestedField::name);
        TypeDescription childType = buildOrcProjection(nestedField.fieldId(), nestedField.type(),
            isRequired && nestedField.isRequired(), mapping);
        childOrcStructType.addField(name, childType);
      }

      orcType.addUnionChild(childOrcStructType);
    } else { // complex union
      // NOTE(review): the first Iceberg field is skipped - presumably it is the tag field of the struct
      // representation of a union; confirm against the ORC-to-Iceberg conversion.
      for (Types.NestedField nestedField : nestedFields.subList(1, nestedFields.size())) {
        TypeDescription childType = buildOrcProjection(nestedField.fieldId(), nestedField.type(),
            isRequired && nestedField.isRequired(), mapping);
        orcType.addUnionChild(childType);
      }
    }
    return orcType;
  }

  /**
   * Walks an ORC type and collects, for every non-root type that carries an Iceberg ID attribute, an entry
   * from that ID to the column's name and type.
   *
   * <p>Children are named after their struct field name, {@code "option" + index} for union children,
   * {@code "element"} for list elements and {@code "key"} / {@code "value"} for map children.
   *
   * @param name the name to record for {@code orcType} itself
   * @param orcType the ORC type to walk
   * @return a mutable map from Iceberg field ID to the matching ORC column
   */
  private static Map<Integer, OrcField> icebergToOrcMapping(String name, TypeDescription orcType) {
    Map<Integer, OrcField> icebergToOrc = Maps.newHashMap();
    switch (orcType.getCategory()) {
      case STRUCT:
        List<String> childrenNames = orcType.getFieldNames();
        List<TypeDescription> children = orcType.getChildren();
        for (int i = 0; i < children.size(); i++) {
          icebergToOrc.putAll(icebergToOrcMapping(childrenNames.get(i), children.get(i)));
        }
        break;
      case UNION:
        // This is part of building orc read schema in file level. orcType has union type inside it.
        List<TypeDescription> options = orcType.getChildren();
        for (int i = 0; i < options.size(); i++) {
          icebergToOrc.putAll(icebergToOrcMapping("option" + i, options.get(i)));
        }
        break;
      case LIST:
        icebergToOrc.putAll(icebergToOrcMapping("element", orcType.getChildren().get(0)));
        break;
      case MAP:
        icebergToOrc.putAll(icebergToOrcMapping("key", orcType.getChildren().get(0)));
        icebergToOrc.putAll(icebergToOrcMapping("value", orcType.getChildren().get(1)));
        break;
      default:
        // Remaining categories have no children to descend into.
        break;
    }

    if (orcType.getId() > 0) {
      // Only add to non-root types.
      Optional<Integer> icebergId = icebergID(orcType);
      icebergId.ifPresent(id -> icebergToOrc.put(id, new OrcField(name, orcType)));
    }

    return icebergToOrc;
  }


  /**
   * Returns the ORC type to read with when the Iceberg type is a supported promotion of the file's type.
   *
   * <p>Supported promotions are int to long, float to double, and decimal(P, S) to decimal(P', S) with
   * P' greater than P.
   *
   * @param icebergType the Iceberg type requested by the reader
   * @param originalOrcType the type of the column in the ORC file
   * @return the promoted ORC type, or empty if no promotion applies
   */
  private static Optional<TypeDescription> getPromotedType(Type icebergType,
                                                           TypeDescription originalOrcType) {
    TypeDescription promotedOrcType = null;
    if (Type.TypeID.LONG.equals(icebergType.typeId()) &&
        TypeDescription.Category.INT.equals(originalOrcType.getCategory())) {
      // Promote: int to long
      promotedOrcType = TypeDescription.createLong();
    } else if (Type.TypeID.DOUBLE.equals(icebergType.typeId()) &&
        TypeDescription.Category.FLOAT.equals(originalOrcType.getCategory())) {
      // Promote: float to double
      promotedOrcType = TypeDescription.createDouble();
    } else if (Type.TypeID.DECIMAL.equals(icebergType.typeId()) &&
        TypeDescription.Category.DECIMAL.equals(originalOrcType.getCategory())) {
      // Promote: decimal(P, S) to decimal(P', S) if P' > P
      Types.DecimalType newDecimal = (Types.DecimalType) icebergType;
      if (newDecimal.scale() == originalOrcType.getScale() &&
          newDecimal.precision() > originalOrcType.getPrecision()) {
        promotedOrcType = TypeDescription.createDecimal()
            .withScale(newDecimal.scale())
            .withPrecision(newDecimal.precision());
      }
    }
    return Optional.ofNullable(promotedOrcType);
  }

  /**
   * Tells whether an ORC type is an accepted representation of an Iceberg type. Timestamps are matched on
   * the adjust-to-UTC flag; every other type is looked up in {@code TYPE_MAPPING}.
   *
   * @param orcType the type of the column in the ORC file
   * @param icebergType the Iceberg type requested by the reader
   * @return true if the ORC category is accepted for the Iceberg type
   */
  private static boolean isSameType(TypeDescription orcType, Type icebergType) {
    if (icebergType.typeId() != Type.TypeID.TIMESTAMP) {
      return TYPE_MAPPING.containsEntry(icebergType.typeId(), orcType.getCategory());
    }

    boolean adjustToUtc = ((Types.TimestampType) icebergType).shouldAdjustToUTC();
    TypeDescription.Category expectedCategory = adjustToUtc ?
        TypeDescription.Category.TIMESTAMP_INSTANT :
        TypeDescription.Category.TIMESTAMP;
    return Objects.equals(expectedCategory, orcType.getCategory());
  }

  /**
   * Reads the Iceberg field ID stored in the {@code iceberg.id} attribute of an ORC type.
   *
   * @param orcType the ORC type to inspect
   * @return the field ID, or empty if the attribute is not set
   * @throws NumberFormatException if the attribute value is not a valid integer
   */
  static Optional<Integer> icebergID(TypeDescription orcType) {
    return Optional.ofNullable(orcType.getAttributeValue(ICEBERG_ID_ATTRIBUTE))
        .map(Integer::parseInt);
  }

  /**
   * Returns the Iceberg field ID stored in the {@code iceberg.id} attribute of an ORC type.
   *
   * @param orcType the ORC type to inspect
   * @return the field ID
   * @throws NullPointerException if the attribute is not set
   * @throws NumberFormatException if the attribute value is not a valid integer
   */
  public static int fieldId(TypeDescription orcType) {
    String rawId = Preconditions.checkNotNull(
        orcType.getAttributeValue(ICEBERG_ID_ATTRIBUTE),
        "Missing expected '%s' property", ICEBERG_ID_ATTRIBUTE);
    return Integer.parseInt(rawId);
  }

  /**
   * Tells whether an ORC type represents an optional Iceberg field, based on its {@code iceberg.required}
   * attribute. A type without the attribute is treated as optional.
   *
   * @param orcType the ORC type to inspect
   * @return false only when the attribute is present and parses as true
   */
  static boolean isOptional(TypeDescription orcType) {
    String requiredFlag = orcType.getAttributeValue(ICEBERG_REQUIRED_ATTRIBUTE);
    return requiredFlag == null || !Boolean.parseBoolean(requiredFlag);
  }

  /**
   * Runs a {@code RemoveIds} visitor over the given ORC type and returns its result.
   *
   * @param type the ORC type to visit
   * @return the type produced by the visitor
   */
  static TypeDescription removeIds(TypeDescription type) {
    RemoveIds idRemover = new RemoveIds();
    return OrcSchemaVisitor.visit(type, idRemover);
  }

  /**
   * Runs a {@code HasIds} visitor over the given ORC schema and returns its verdict.
   *
   * @param orcSchema the ORC schema to visit
   * @return the boolean produced by the visitor
   */
  static boolean hasIds(TypeDescription orcSchema) {
    HasIds idDetector = new HasIds();
    return OrcSchemaVisitor.visit(orcSchema, idDetector);
  }

  /**
   * Runs an {@code ApplyNameMapping} visitor, configured with the given name mapping, over an ORC schema.
   *
   * @param orcSchema the ORC schema to visit
   * @param nameMapping the name mapping handed to the visitor
   * @return the type produced by the visitor
   */
  static TypeDescription applyNameMapping(TypeDescription orcSchema, NameMapping nameMapping) {
    ApplyNameMapping mappingApplier = new ApplyNameMapping(nameMapping);
    return OrcSchemaVisitor.visit(orcSchema, mappingApplier);
  }

  /**
   * Generates mapping from field IDs to ORC qualified names. See {@link IdToOrcName} for details.
   *
   * @param schema the Iceberg schema to visit
   * @return a map from Iceberg field ID to the ORC qualified name of that field
   */
  public static Map<Integer, String> idToOrcName(Schema schema) {
    return TypeUtil.visit(schema, new IdToOrcName());
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy