/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.format;

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.InstantFileNameGenerator;
import org.apache.hudi.common.table.timeline.TimelineLayout;
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.hudi.common.util.InternalSchemaCache;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.action.InternalSchemaMerger;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.util.AvroSchemaConverter;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.types.DataType;
import org.apache.flink.util.Preconditions;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * This class is responsible for resolving the names and types of fields as they existed at a
 * certain point in time. If a field has been renamed in the queried schema, the old name that
 * was in effect at the provided time is returned. If the type of a field has changed, its old
 * type is returned, and a projection is created that converts the old type to the queried one.
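 *
 * <p>A hypothetical usage sketch (the {@code conf}, {@code metaClient} and {@code fileName}
 * variables are assumed to be supplied by the caller):
 * <pre>{@code
 * InternalSchemaManager manager = InternalSchemaManager.get(conf, metaClient);
 * // an empty merge schema means the file can be read with the query schema as-is
 * InternalSchema mergeSchema = manager.getMergeSchema(fileName);
 * }</pre>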
 */
public class InternalSchemaManager implements Serializable {

  private static final long serialVersionUID = 1L;

  public static final InternalSchemaManager DISABLED = new InternalSchemaManager(null, InternalSchema.getEmptyInternalSchema(), null, null,
      TimelineLayout.fromVersion(TimelineLayoutVersion.CURR_LAYOUT_VERSION));

  private final Configuration conf;
  private final InternalSchema querySchema;
  private final String validCommits;
  private final String tablePath;
  private final TimelineLayout layout;

  private transient org.apache.hadoop.conf.Configuration hadoopConf;
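
  /**
   * Creates an {@code InternalSchemaManager} for the given table, or returns {@link #DISABLED}
   * when schema evolution is disabled or no internal schema is present in the commit metadata.
   */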
  public static InternalSchemaManager get(Configuration conf, HoodieTableMetaClient metaClient) {
    if (!OptionsResolver.isSchemaEvolutionEnabled(conf)) {
      return DISABLED;
    }
    Option<InternalSchema> internalSchema = new TableSchemaResolver(metaClient).getTableInternalSchemaFromCommitMetadata();
    if (!internalSchema.isPresent() || internalSchema.get().isEmptySchema()) {
      return DISABLED;
    }
    InstantFileNameGenerator factory = metaClient.getInstantFileNameGenerator();
    String validCommits = metaClient
        .getCommitsAndCompactionTimeline()
        .filterCompletedInstants()
        .getInstantsAsStream()
        .map(factory::getFileName)
        .collect(Collectors.joining(","));
    return new InternalSchemaManager(conf, internalSchema.get(), validCommits, metaClient.getBasePath().toString(), metaClient.getTimelineLayout());
  }
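
  /**
   * Instantiates the manager with a pre-resolved query schema.
   *
   * @param conf         Flink configuration used to derive the Hadoop configuration
   * @param querySchema  the internal schema the query expects
   * @param validCommits comma-separated completed instant file names used for schema lookup
   * @param tablePath    base path of the Hudi table
   * @param layout       timeline layout of the table
   */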
  public InternalSchemaManager(Configuration conf, InternalSchema querySchema, String validCommits, String tablePath,
                               TimelineLayout layout) {
    this.conf = conf;
    this.querySchema = querySchema;
    this.validCommits = validCommits;
    this.tablePath = tablePath;
    this.layout = layout;
  }

  public InternalSchema getQuerySchema() {
    return querySchema;
  }

  /**
   * Attempts to merge the file schema and the query schema to produce a mergeSchema, prioritising the use of fileSchema types.
   * An emptySchema is returned if:
   * 1. an empty querySchema is provided, or
   * 2. the querySchema is equal to the fileSchema.
   *
   * <p>Note that an emptySchema being returned means that no merging needs to be performed.
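   *
   * <p>A hypothetical sketch of how a caller might act on the result (the {@code manager}
   * and {@code fileName} variables are assumed):
   * <pre>{@code
   * InternalSchema mergeSchema = manager.getMergeSchema(fileName);
   * if (mergeSchema.isEmptySchema()) {
   *   // no evolution between the file and the query: read with the query schema directly
   * } else {
   *   // read with mergeSchema, then cast/rename the columns back to the query schema
   * }
   * }</pre>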
   *
   * @param fileName name of the file to fetch the commitTime/versionId for
   * @return the mergeSchema, i.e. the schema with which the file should be read
   */
  InternalSchema getMergeSchema(String fileName) {
    if (querySchema.isEmptySchema()) {
      return querySchema;
    }
    long commitInstantTime = Long.parseLong(FSUtils.getCommitTime(fileName));
    InternalSchema fileSchema = InternalSchemaCache.getInternalSchemaByVersionId(
        commitInstantTime, tablePath,
        new HoodieHadoopStorage(tablePath, getHadoopConf()),
        validCommits, layout.getInstantFileNameParser(),
        layout.getCommitMetadataSerDe(), layout.getInstantGenerator());
    if (querySchema.equals(fileSchema)) {
      return InternalSchema.getEmptyInternalSchema();
    }
    return new InternalSchemaMerger(fileSchema, querySchema, true, true).mergeSchema();
  }

  /**
   * This method returns a mapping of columns that have type inconsistencies between the mergeSchema and the querySchema.
   * This is done by:
   * 1. Finding the columns with type changes
   * 2. Building a map storing the indexes of these columns with type changes; Map of -> (colIdxInQueryFieldNames, colIdxInQuerySchema)
   * 3. For each selectedField with a type change, building a castMap containing the cast/conversion details;
   *    Map of -> (selectedPos, Cast([from] fileType, [to] queryType))
   *
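   * <p>Hypothetical illustration: if the column at selected position 2 was written to the file
   * as {@code INT} but is queried as {@code BIGINT}, the returned castMap will contain an entry
   * equivalent to {@code (2, Cast([from] INT, [to] BIGINT))}.
   *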
   * @param mergeSchema     InternalSchema representation of the mergeSchema (prioritising the use of fileSchema types) that is used for reading base parquet files
   * @param queryFieldNames array containing the column names of a Hudi Flink table
   * @param queryFieldTypes array containing the field types of the columns of a Hudi Flink table
   * @param selectedFields  array containing the indexes of the columns of interest (indexes are based on queryFieldNames and queryFieldTypes)
   * @return a castMap containing the information of how to cast a selectedField from its fileType to its queryType
   *
   * @see CastMap
   */
  CastMap getCastMap(InternalSchema mergeSchema, String[] queryFieldNames, DataType[] queryFieldTypes, int[] selectedFields) {
    Preconditions.checkArgument(!querySchema.isEmptySchema(), "querySchema cannot be empty");
    Preconditions.checkArgument(!mergeSchema.isEmptySchema(), "mergeSchema cannot be empty");
    CastMap castMap = new CastMap();
    // map storing the indexes of columns with type changes: (colIdxInQueryFieldNames, colIdxInQuerySchema)
    Map<Integer, Integer> posProxy = getPosProxy(mergeSchema, queryFieldNames);
    if (posProxy.isEmpty()) {
      // no type changes
      castMap.setFileFieldTypes(queryFieldTypes);
      return castMap;
    }
    List<Integer> selectedFieldList = IntStream.of(selectedFields).boxed().collect(Collectors.toList());
    // mergeSchema is built with useColumnTypeFromFileSchema = true
    List<DataType> mergeSchemaAsDataTypes = AvroSchemaConverter.convertToDataType(
        AvroInternalSchemaConverter.convert(mergeSchema, "tableName")).getChildren();
    DataType[] fileFieldTypes = new DataType[queryFieldTypes.length];
    for (int i = 0; i < queryFieldTypes.length; i++) {
      // position of the changed type in querySchema
      Integer posOfChangedType = posProxy.get(i);
      if (posOfChangedType == null) {
        // no type change for this column; fileFieldType == queryFieldType
        fileFieldTypes[i] = queryFieldTypes[i];
      } else {
        // type change detected for this column
        DataType fileType = mergeSchemaAsDataTypes.get(posOfChangedType);
        // update fileFieldType to match the type found in mergeSchema
        fileFieldTypes[i] = fileType;
        int selectedPos = selectedFieldList.indexOf(i);
        if (selectedPos != -1) {
          // if the column is part of the user's query, add it to the castMap
          // castMap -> (position, Cast([from] fileType, [to] queryType))
          castMap.add(selectedPos, fileType.getLogicalType(), queryFieldTypes[i].getLogicalType());
        }
      }
    }
    castMap.setFileFieldTypes(fileFieldTypes);
    return castMap;
  }

  /**
   * For columns that have been modified via the column renaming operation, the column name might be inconsistent
   * between the querySchema and the mergeSchema.
   *
   * <p>As such, this method identifies all columns that have been renamed and returns a string array of column names
   * corresponding to the column names found in the mergeSchema.
   *
   * <p>This is done by:
   * 1. Getting the rename mapping of -> (colNameFromNewSchema, colNameLastPartFromOldSchema)
   * 2. For columns that have been renamed, replacing them with the old column name
   *
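   * <p>Hypothetical illustration: if a column was renamed from {@code fare} to {@code price},
   * then for the query field names {@code ["uuid", "price"]} this method returns
   * {@code ["uuid", "fare"]}, so that an older file can be read with the names it was written with.
   *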
   * @param mergeSchema     InternalSchema representation of the mergeSchema (prioritising the use of fileSchema types) that is used for reading base parquet files
   * @param queryFieldNames array containing the column names of a Hudi Flink table
   * @return a string array containing column names corresponding to the column names found in the mergeSchema
   *
   * @see InternalSchemaUtils#collectRenameCols(InternalSchema, InternalSchema)
   */
  String[] getMergeFieldNames(InternalSchema mergeSchema, String[] queryFieldNames) {
    Preconditions.checkArgument(!querySchema.isEmptySchema(), "querySchema cannot be empty");
    Preconditions.checkArgument(!mergeSchema.isEmptySchema(), "mergeSchema cannot be empty");
    Map<String, String> renamedCols = InternalSchemaUtils.collectRenameCols(mergeSchema, querySchema);
    if (renamedCols.isEmpty()) {
      return queryFieldNames;
    }
    return Arrays.stream(queryFieldNames).map(name -> renamedCols.getOrDefault(name, name)).toArray(String[]::new);
  }
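
  /**
   * Builds a mapping from a column's index in {@code queryFieldNames} to its index in the
   * {@code querySchema} columns, restricted to columns whose type changed between the
   * querySchema and the mergeSchema.
   */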
  private Map<Integer, Integer> getPosProxy(InternalSchema mergeSchema, String[] queryFieldNames) {
    Map<Integer, Pair<Type, Type>> changedCols = InternalSchemaUtils.collectTypeChangedCols(querySchema, mergeSchema);
    HashMap<Integer, Integer> posProxy = new HashMap<>(changedCols.size());
    List<String> fieldNameList = Arrays.asList(queryFieldNames);
    List<Types.Field> columns = querySchema.columns();
    changedCols.forEach((posInSchema, typePair) -> {
      String name = columns.get(posInSchema).name();
      int posInType = fieldNameList.indexOf(name);
      posProxy.put(posInType, posInSchema);
    });
    return Collections.unmodifiableMap(posProxy);
  }
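
  /**
   * Returns the Hadoop configuration, lazily rebuilding it from the Flink configuration
   * since the field is transient and not restored on deserialization.
   */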
  private org.apache.hadoop.conf.Configuration getHadoopConf() {
    if (hadoopConf == null) {
      hadoopConf = HadoopConfigurations.getHadoopConf(conf);
    }
    return hadoopConf;
  }
}