All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.table.format.InternalSchemaManager Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table.format;

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.InstantFileNameGenerator;
import org.apache.hudi.common.table.timeline.TimelineLayout;
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.hudi.common.util.InternalSchemaCache;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.action.InternalSchemaMerger;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.util.AvroSchemaConverter;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.types.DataType;
import org.apache.flink.util.Preconditions;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * This class is responsible for calculating names and types of fields that are actual at a certain point in time.
 * If field is renamed in queried schema, its old name will be returned, which is relevant at the provided time.
 * If type of field is changed, its old type will be returned, and projection will be created that will convert the old type to the queried one.
 */
public class InternalSchemaManager implements Serializable {

  private static final long serialVersionUID = 1L;

  public static final InternalSchemaManager DISABLED = new InternalSchemaManager(null, InternalSchema.getEmptyInternalSchema(), null, null,
      TimelineLayout.fromVersion(TimelineLayoutVersion.CURR_LAYOUT_VERSION));

  private final Configuration conf;
  private final InternalSchema querySchema;
  private final String validCommits;
  private final String tablePath;
  private final TimelineLayout layout;
  private transient org.apache.hadoop.conf.Configuration hadoopConf;

  public static InternalSchemaManager get(Configuration conf, HoodieTableMetaClient metaClient) {
    if (!OptionsResolver.isSchemaEvolutionEnabled(conf)) {
      return DISABLED;
    }
    Option internalSchema = new TableSchemaResolver(metaClient).getTableInternalSchemaFromCommitMetadata();
    if (!internalSchema.isPresent() || internalSchema.get().isEmptySchema()) {
      return DISABLED;
    }

    InstantFileNameGenerator factory = metaClient.getInstantFileNameGenerator();
    String validCommits = metaClient
        .getCommitsAndCompactionTimeline()
        .filterCompletedInstants()
        .getInstantsAsStream()
        .map(factory::getFileName)
        .collect(Collectors.joining(","));
    return new InternalSchemaManager(conf, internalSchema.get(), validCommits, metaClient.getBasePath().toString(), metaClient.getTimelineLayout());
  }

  public InternalSchemaManager(Configuration conf, InternalSchema querySchema, String validCommits, String tablePath,
                               TimelineLayout layout) {
    this.conf = conf;
    this.querySchema = querySchema;
    this.validCommits = validCommits;
    this.tablePath = tablePath;
    this.layout = layout;
  }

  public InternalSchema getQuerySchema() {
    return querySchema;
  }

  /**
   * Attempts to merge the file and query schema to produce a mergeSchema, prioritising the use of fileSchema types.
   * An emptySchema is returned if:
   * 
    *
  • 1. An empty querySchema is provided
  • *
  • 2. querySchema is equal to fileSchema
  • *
* Note that this method returns an emptySchema if merging is not required to be performed. * @param fileName Name of file to fetch commitTime/versionId for * @return mergeSchema, i.e. the schema on which the file should be read with */ InternalSchema getMergeSchema(String fileName) { if (querySchema.isEmptySchema()) { return querySchema; } long commitInstantTime = Long.parseLong(FSUtils.getCommitTime(fileName)); InternalSchema fileSchema = InternalSchemaCache.getInternalSchemaByVersionId( commitInstantTime, tablePath, new HoodieHadoopStorage(tablePath, getHadoopConf()), validCommits, layout.getInstantFileNameParser(), layout.getCommitMetadataSerDe(), layout.getInstantGenerator()); if (querySchema.equals(fileSchema)) { return InternalSchema.getEmptyInternalSchema(); } return new InternalSchemaMerger(fileSchema, querySchema, true, true).mergeSchema(); } /** * This method returns a mapping of columns that have type inconsistencies between the mergeSchema and querySchema. * This is done by: *
  • 1. Finding the columns with type changes
  • *
  • 2. Get a map storing the index of these columns with type changes; Map of -> (colIdxInQueryFieldNames, colIdxInQuerySchema)
  • *
  • 3. For each selectedField with type changes, build a castMap containing the cast/conversion details; * Map of -> (selectedPos, Cast([from] fileType, [to] queryType))
  • * * @param mergeSchema InternalSchema representation of mergeSchema (prioritise use of fileSchemaType) that is used for reading base parquet files * @param queryFieldNames array containing the columns of a Hudi Flink table * @param queryFieldTypes array containing the field types of the columns of a Hudi Flink table * @param selectedFields array containing the index of the columns of interest required (indexes are based on queryFieldNames and queryFieldTypes) * @return a castMap containing the information of how to cast a selectedField from the fileType to queryType. * * @see CastMap */ CastMap getCastMap(InternalSchema mergeSchema, String[] queryFieldNames, DataType[] queryFieldTypes, int[] selectedFields) { Preconditions.checkArgument(!querySchema.isEmptySchema(), "querySchema cannot be empty"); Preconditions.checkArgument(!mergeSchema.isEmptySchema(), "mergeSchema cannot be empty"); CastMap castMap = new CastMap(); // map storing the indexes of columns with type changes Map of -> (colIdxInQueryFieldNames, colIdxInQuerySchema) Map posProxy = getPosProxy(mergeSchema, queryFieldNames); if (posProxy.isEmpty()) { // no type changes castMap.setFileFieldTypes(queryFieldTypes); return castMap; } List selectedFieldList = IntStream.of(selectedFields).boxed().collect(Collectors.toList()); // mergeSchema is built with useColumnTypeFromFileSchema = true List mergeSchemaAsDataTypes = AvroSchemaConverter.convertToDataType( AvroInternalSchemaConverter.convert(mergeSchema, "tableName")).getChildren(); DataType[] fileFieldTypes = new DataType[queryFieldTypes.length]; for (int i = 0; i < queryFieldTypes.length; i++) { // position of ChangedType in querySchema Integer posOfChangedType = posProxy.get(i); if (posOfChangedType == null) { // no type change for column; fileFieldType == queryFieldType fileFieldTypes[i] = queryFieldTypes[i]; } else { // type change detected for column; DataType fileType = mergeSchemaAsDataTypes.get(posOfChangedType); // update fileFieldType match the type found in mergeSchema fileFieldTypes[i] = fileType; int selectedPos = selectedFieldList.indexOf(i); if (selectedPos != -1) { // if the column is part of user's query, add it into the castMap // castMap -> (position, Cast([from] fileType, [to] queryType)) castMap.add(selectedPos, fileType.getLogicalType(), queryFieldTypes[i].getLogicalType()); } } } castMap.setFileFieldTypes(fileFieldTypes); return castMap; } /** * For columns that have been modified via the column renaming operation, the column name might be inconsistent * between querySchema and mergeSchema. *

    * As such, this method will identify all columns that have been renamed, and return a string array of column names * corresponding to the column names found in the mergeSchema. *

    * This is done by: *

  • 1. Get the rename mapping of -> (colNameFromNewSchema, colNameLastPartFromOldSchema)
  • *
  • 2. For columns that have been renamed, replace them with the old column name
  • * * @param mergeSchema InternalSchema representation of mergeSchema (prioritise use of fileSchemaType) that is used for reading base parquet files * @param queryFieldNames array containing the columns of a Hudi Flink table * @return String array containing column names corresponding to the column names found in the mergeSchema * * @see InternalSchemaUtils#collectRenameCols(InternalSchema, InternalSchema) */ String[] getMergeFieldNames(InternalSchema mergeSchema, String[] queryFieldNames) { Preconditions.checkArgument(!querySchema.isEmptySchema(), "querySchema cannot be empty"); Preconditions.checkArgument(!mergeSchema.isEmptySchema(), "mergeSchema cannot be empty"); Map renamedCols = InternalSchemaUtils.collectRenameCols(mergeSchema, querySchema); if (renamedCols.isEmpty()) { return queryFieldNames; } return Arrays.stream(queryFieldNames).map(name -> renamedCols.getOrDefault(name, name)).toArray(String[]::new); } private Map getPosProxy(InternalSchema mergeSchema, String[] queryFieldNames) { Map> changedCols = InternalSchemaUtils.collectTypeChangedCols(querySchema, mergeSchema); HashMap posProxy = new HashMap<>(changedCols.size()); List fieldNameList = Arrays.asList(queryFieldNames); List columns = querySchema.columns(); changedCols.forEach((posInSchema, typePair) -> { String name = columns.get(posInSchema).name(); int posInType = fieldNameList.indexOf(name); posProxy.put(posInType, posInSchema); }); return Collections.unmodifiableMap(posProxy); } private org.apache.hadoop.conf.Configuration getHadoopConf() { if (hadoopConf == null) { hadoopConf = HadoopConfigurations.getHadoopConf(conf); } return hadoopConf; } }




    © 2015 - 2025 Weber Informatics LLC | Privacy Policy