All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.iceberg.Schema Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.BiMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableBiMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.relocated.com.google.common.primitives.Ints;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.iceberg.types.Types.StructType;

/**
 * The schema of a data table.
 * 

* Schema ID will only be populated when reading from/writing to table metadata, * otherwise it will be default to 0. */ public class Schema implements Serializable { private static final Joiner NEWLINE = Joiner.on('\n'); private static final String ALL_COLUMNS = "*"; private static final int DEFAULT_SCHEMA_ID = 0; private final StructType struct; private final int schemaId; private final int[] identifierFieldIds; private transient BiMap aliasToId = null; private transient Map idToField = null; private transient Map nameToId = null; private transient Map lowerCaseNameToId = null; private transient Map> idToAccessor = null; private transient Map idToName = null; private transient Set identifierFieldIdSet = null; public Schema(List columns, Map aliases) { this(columns, aliases, ImmutableSet.of()); } public Schema(List columns, Map aliases, Set identifierFieldIds) { this(DEFAULT_SCHEMA_ID, columns, aliases, identifierFieldIds); } public Schema(List columns) { this(columns, ImmutableSet.of()); } public Schema(List columns, Set identifierFieldIds) { this(DEFAULT_SCHEMA_ID, columns, identifierFieldIds); } public Schema(int schemaId, List columns) { this(schemaId, columns, ImmutableSet.of()); } public Schema(int schemaId, List columns, Set identifierFieldIds) { this(schemaId, columns, null, identifierFieldIds); } public Schema(int schemaId, List columns, Map aliases, Set identifierFieldIds) { this.schemaId = schemaId; this.struct = StructType.of(columns); this.aliasToId = aliases != null ? ImmutableBiMap.copyOf(aliases) : null; // validate IdentifierField if (identifierFieldIds != null) { Map idToParent = TypeUtil.indexParents(struct); identifierFieldIds.forEach(id -> validateIdentifierField(id, lazyIdToField(), idToParent)); } this.identifierFieldIds = identifierFieldIds != null ? 
Ints.toArray(identifierFieldIds) : new int[0]; lazyIdToName(); } static void validateIdentifierField(int fieldId, Map idToField, Map idToParent) { Types.NestedField field = idToField.get(fieldId); Preconditions.checkArgument(field.type().isPrimitiveType(), "Cannot add field %s as an identifier field: not a primitive type field", field.name()); Preconditions.checkArgument(field.isRequired(), "Cannot add field %s as an identifier field: not a required field", field.name()); Preconditions.checkArgument(!Types.DoubleType.get().equals(field.type()) && !Types.FloatType.get().equals(field.type()), "Cannot add field %s as an identifier field: must not be float or double field", field.name()); // check whether the nested field is in a chain of struct fields Integer parentId = idToParent.get(field.fieldId()); while (parentId != null) { Types.NestedField parent = idToField.get(parentId); Preconditions.checkArgument(parent.type().isStructType(), "Cannot add field %s as an identifier field: must not be nested in %s", field.name(), parent); parentId = idToParent.get(parent.fieldId()); } } public Schema(NestedField... columns) { this(DEFAULT_SCHEMA_ID, Arrays.asList(columns)); } public Schema(int schemaId, NestedField... 
columns) { this(schemaId, Arrays.asList(columns)); } private Map lazyIdToField() { if (idToField == null) { this.idToField = TypeUtil.indexById(struct); } return idToField; } private Map lazyNameToId() { if (nameToId == null) { this.nameToId = ImmutableMap.copyOf(TypeUtil.indexByName(struct)); } return nameToId; } private Map lazyIdToName() { if (idToName == null) { this.idToName = ImmutableMap.copyOf(TypeUtil.indexNameById(struct)); } return idToName; } private Map lazyLowerCaseNameToId() { if (lowerCaseNameToId == null) { this.lowerCaseNameToId = ImmutableMap.copyOf(TypeUtil.indexByLowerCaseName(struct)); } return lowerCaseNameToId; } private Map> lazyIdToAccessor() { if (idToAccessor == null) { idToAccessor = Accessors.forSchema(this); } return idToAccessor; } private Set lazyIdentifierFieldIdSet() { if (identifierFieldIdSet == null) { identifierFieldIdSet = ImmutableSet.copyOf(Ints.asList(identifierFieldIds)); } return identifierFieldIdSet; } /** * Returns the schema ID for this schema. *

* Note that schema ID will only be populated when reading from/writing to table metadata, * otherwise it will be default to 0. */ public int schemaId() { return this.schemaId; } /** * Returns an alias map for this schema, if set. *

* Alias maps are created when translating an external schema, like an Avro Schema, to this * format. The original column names can be provided in a Map when constructing this Schema. * * @return a Map of column aliases to field ids */ public Map getAliases() { return aliasToId; } /** * Returns the underlying {@link StructType struct type} for this schema. * * @return the StructType version of this schema. */ public StructType asStruct() { return struct; } /** * Returns a List of the {@link NestedField columns} in this Schema. */ public List columns() { return struct.fields(); } /** * The set of identifier field IDs. *

* Identifier is a concept similar to primary key in a relational database system. * It consists of a unique set of primitive fields in the schema. * An identifier field must be at root, or nested in a chain of structs (no maps or lists). * A row should be unique in a table based on the values of the identifier fields. * Optional, float and double columns cannot be used as identifier fields. * However, Iceberg identifier differs from primary key in the following ways: *

    *
  • Iceberg does not enforce the uniqueness of a row based on this identifier information. * It is used for operations like upsert to define the default upsert key.
  • *
  • A nested field in a struct can be used as an identifier. For example, if there is a "last_name" field * inside a "user" struct in a schema, field "user.last_name" can be set as a part of the identifier field.
  • *
*

* * @return the set of identifier field IDs in this schema. */ public Set identifierFieldIds() { return lazyIdentifierFieldIdSet(); } /** * Returns the set of identifier field names. */ public Set identifierFieldNames() { return identifierFieldIds() .stream() .map(id -> lazyIdToName().get(id)) .collect(Collectors.toSet()); } /** * Returns the {@link Type} of a sub-field identified by the field name. * * @param name a field name * @return a Type for the sub-field or null if it is not found */ public Type findType(String name) { Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)"); Integer id = lazyNameToId().get(name); if (id != null) { // name is found return findType(id); } // name could not be found return null; } /** * Returns the {@link Type} of a sub-field identified by the field id. * * @param id a field id * @return a Type for the sub-field or null if it is not found */ public Type findType(int id) { NestedField field = lazyIdToField().get(id); if (field != null) { return field.type(); } return null; } /** * Returns the sub-field identified by the field id as a {@link NestedField}. * * @param id a field id * @return the sub-field or null if it is not found */ public NestedField findField(int id) { return lazyIdToField().get(id); } /** * Returns a sub-field by name as a {@link NestedField}. *

* The result may be a top-level or a nested field. * * @param name a String name * @return a Type for the sub-field or null if it is not found */ public NestedField findField(String name) { Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)"); Integer id = lazyNameToId().get(name); if (id != null) { return lazyIdToField().get(id); } return null; } /** * Returns a sub-field by name as a {@link NestedField}. *

* The result may be a top-level or a nested field. * * @param name a String name * @return the sub-field or null if it is not found */ public NestedField caseInsensitiveFindField(String name) { Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)"); Integer id = lazyLowerCaseNameToId().get(name.toLowerCase(Locale.ROOT)); if (id != null) { return lazyIdToField().get(id); } return null; } /** * Returns the full column name for the given id. * * @param id a field id * @return the full column name in this schema that resolves to the id */ public String findColumnName(int id) { return lazyIdToName().get(id); } /** * Returns the column id for the given column alias. Column aliases are set * by conversions from Parquet or Avro to this Schema type. * * @param alias a full column name in the unconverted data schema * @return the column id in this schema, or null if the column wasn't found */ public Integer aliasToId(String alias) { if (aliasToId != null) { return aliasToId.get(alias); } return null; } /** * Returns the full column name in the unconverted data schema for the given column id. * Column aliases are set by conversions from Parquet or Avro to this Schema type. * * @param fieldId a column id in this schema * @return the full column name in the unconverted data schema, or null if one wasn't found */ public String idToAlias(Integer fieldId) { if (aliasToId != null) { return aliasToId.inverse().get(fieldId); } return null; } /** * Returns an accessor for retrieving the data from {@link StructLike}. *

* Accessors do not retrieve data contained in lists or maps. * * @param id a column id in this schema * @return an {@link Accessor} to retrieve values from a {@link StructLike} row */ public Accessor accessorForField(int id) { return lazyIdToAccessor().get(id); } /** * Creates a projection schema for a subset of columns, selected by name. *

* Names that identify nested fields will select part or all of the field's top-level column. * * @param names String names for selected columns * @return a projection schema from this schema, by name */ public Schema select(String... names) { return select(Arrays.asList(names)); } /** * Creates a projection schema for a subset of columns, selected by name. *

* Names that identify nested fields will select part or all of the field's top-level column. * * @param names a List of String names for selected columns * @return a projection schema from this schema, by name */ public Schema select(Collection names) { return internalSelect(names, true); } /** * Creates a projection schema for a subset of columns, selected by case insensitive names *

* Names that identify nested fields will select part or all of the field's top-level column. * * @param names a List of String names for selected columns * @return a projection schema from this schema, by names */ public Schema caseInsensitiveSelect(String... names) { return caseInsensitiveSelect(Arrays.asList(names)); } /** * Creates a projection schema for a subset of columns, selected by case insensitive names *

* Names that identify nested fields will select part or all of the field's top-level column. * * @param names a List of String names for selected columns * @return a projection schema from this schema, by names */ public Schema caseInsensitiveSelect(Collection names) { return internalSelect(names, false); } /** * Checks whether this schema is equivalent to another schema while ignoring the schema ID. * @param anotherSchema another schema * @return true if this schema is equivalent to the given schema */ public boolean sameSchema(Schema anotherSchema) { return asStruct().equals(anotherSchema.asStruct()) && identifierFieldIds().equals(anotherSchema.identifierFieldIds()); } private Schema internalSelect(Collection names, boolean caseSensitive) { if (names.contains(ALL_COLUMNS)) { return this; } Set selected = Sets.newHashSet(); for (String name : names) { Integer id; if (caseSensitive) { id = lazyNameToId().get(name); } else { id = lazyLowerCaseNameToId().get(name.toLowerCase(Locale.ROOT)); } if (id != null) { selected.add(id); } } return TypeUtil.select(this, selected); } private String identifierFieldToString(Types.NestedField field) { return " " + field + (identifierFieldIds().contains(field.fieldId()) ? " (id)" : ""); } @Override public String toString() { return String.format("table {\n%s\n}", NEWLINE.join(struct.fields().stream() .map(this::identifierFieldToString) .collect(Collectors.toList()))); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy