/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hadoop;

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.util.InternalSchemaCache;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.TablePathUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat;
import org.apache.hudi.hadoop.realtime.AbstractRealtimeRecordReader;
import org.apache.hudi.hadoop.realtime.RealtimeSplit;
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.action.InternalSchemaMerger;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;

import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath;

/**
 * This class is responsible for resolving the field names and types that were in effect at a certain point in time for Hive.
 * If a field was renamed in the queried schema, the old name that was valid at the provided time is returned.
 * If a field's type was changed, its old type is returned and a projection is created to convert the old type to the queried one.
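 * <p>A rough usage sketch (names below are illustrative, not part of this class): a record reader
 * builds the context from its split and job conf and triggers the evolution step matching its read path.
 * <pre>{@code
 *   SchemaEvolutionContext evolutionContext = new SchemaEvolutionContext(split, jobConf);
 *   // copy-on-write / plain parquet read:
 *   evolutionContext.doEvolutionForParquetFormat();
 *   // merge-on-read (realtime) read:
 *   // evolutionContext.doEvolutionForRealtimeInputFormat(realtimeRecordReader);
 * }</pre>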
 */
public class SchemaEvolutionContext {

  private static final Logger LOG = LoggerFactory.getLogger(SchemaEvolutionContext.class);

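  // Temporary keys that snapshot the original Hive projection configs before Hudi rewrites them
  // (see getRequireColumn); HIVE_EVOLUTION_ENABLE toggles schema evolution support for the read.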
  private static final String HIVE_TMP_READ_COLUMN_NAMES_CONF_STR = "hive.tmp.io.file.readcolumn.ids";
  private static final String HIVE_TMP_COLUMNS = "hive.tmp.columns";
  private static final String HIVE_EVOLUTION_ENABLE = "hudi.hive.schema.evolution";

  private final InputSplit split;
  private final JobConf job;
  private final HoodieTableMetaClient metaClient;
  public Option<InternalSchema> internalSchemaOption;

  public SchemaEvolutionContext(InputSplit split, JobConf job) throws IOException {
    this(split, job, Option.empty());
  }

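  /**
   * @param split            the input split being read.
   * @param job              the job configuration; schema evolution can be disabled via {@code HIVE_EVOLUTION_ENABLE}.
   * @param metaClientOption an optional pre-built meta client; if absent, one is built from the split's table path.
   */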
  public SchemaEvolutionContext(InputSplit split, JobConf job, Option<HoodieTableMetaClient> metaClientOption) throws IOException {
    this.split = split;
    this.job = job;
    if (!job.getBoolean(HIVE_EVOLUTION_ENABLE, true)) {
      LOG.info("Schema evolution is disabled for split: {}", split);
      internalSchemaOption = Option.empty();
      this.metaClient = null;
      return;
    }
    this.metaClient = metaClientOption.isPresent() ? metaClientOption.get() : setUpHoodieTableMetaClient();
    this.internalSchemaOption = getInternalSchemaFromCache();
  }

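  /**
   * Returns the table's internal schema, preferring the JSON cached in the job conf by
   * {@link HoodieCombineHiveInputFormat}; falls back to resolving it from commit metadata
   * when the cache was never set up (expected only in tests).
   */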
  public Option<InternalSchema> getInternalSchemaFromCache() throws IOException {
    Option<InternalSchema> internalSchemaOpt = getCachedData(
        HoodieCombineHiveInputFormat.INTERNAL_SCHEMA_CACHE_KEY_PREFIX,
        SerDeHelper::fromJson);
    if (internalSchemaOpt == null) {
      // the code path should only be invoked in tests.
      return new TableSchemaResolver(this.metaClient).getTableInternalSchemaFromCommitMetadata();
    }
    return internalSchemaOpt;
  }

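  /**
   * Returns the table's Avro schema, preferring the JSON cached in the job conf; falls back to
   * {@link TableSchemaResolver} when the cache was never set up (expected only in tests).
   */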
  public Schema getAvroSchemaFromCache() throws Exception {
    Option<Schema> avroSchemaOpt = getCachedData(
        HoodieCombineHiveInputFormat.SCHEMA_CACHE_KEY_PREFIX,
        json -> Option.ofNullable(new Schema.Parser().parse(json)));
    if (avroSchemaOpt == null) {
      // the code path should only be invoked in tests.
      return new TableSchemaResolver(this.metaClient).getTableAvroSchema();
    }
    return avroSchemaOpt.orElseThrow(() -> new HoodieValidationException("The avro schema cache should always be set up together with the internal schema cache"));
  }

  /**
   * Returns the cached data for the given key, or null if the cache was never set up.
   */
  @Nullable
  private <T> Option<T> getCachedData(String keyPrefix, Function<String, Option<T>> parser) throws IOException {
    Option<StoragePath> tablePath = getTablePath(job, split);
    if (!tablePath.isPresent()) {
      return Option.empty();
    }
    String cacheKey = keyPrefix + "." + tablePath.get().toUri();
    String cachedJson = job.get(cacheKey);
    if (cachedJson == null) {
      // the code path should only be invoked in tests.
      return null;
    }
    if (cachedJson.isEmpty()) {
      return Option.empty();
    }
    try {
      return parser.apply(cachedJson);
    } catch (Exception e) {
      LOG.warn("Failed to parse data from cache with key: {}", cacheKey, e);
      return Option.empty();
    }
  }

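  /**
   * Resolves the Hudi table base path from the split's file path, if the split is a {@link FileSplit}.
   */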
  private Option<StoragePath> getTablePath(JobConf job, InputSplit split) throws IOException {
    if (split instanceof FileSplit) {
      Path path = ((FileSplit) split).getPath();
      FileSystem fs = path.getFileSystem(job);
      HoodieStorage storage = new HoodieHadoopStorage(fs);
      return TablePathUtils.getTablePath(storage, HadoopFSUtils.convertToStoragePath(path));
    }
    return Option.empty();
  }

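  /**
   * Builds a {@link HoodieTableMetaClient} from the split's table path, returning null if the
   * path does not belong to a valid Hudi table.
   */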
  private HoodieTableMetaClient setUpHoodieTableMetaClient() {
    try {
      Path inputPath = ((FileSplit) split).getPath();
      FileSystem fs = inputPath.getFileSystem(job);
      HoodieStorage storage = new HoodieHadoopStorage(fs);
      Option<StoragePath> tablePath = TablePathUtils.getTablePath(storage, convertToStoragePath(inputPath));
      return HoodieTableMetaClient.builder().setBasePath(tablePath.get().toString())
          .setConf(HadoopFSUtils.getStorageConfWithCopy(job)).build();
    } catch (Exception e) {
      LOG.warn(String.format("Not a valid hoodie table, table path: %s", ((FileSplit) split).getPath()), e);
      return null;
    }
  }

  /**
   * Do schema evolution for RealtimeInputFormat.
   *
   * @param realtimeRecordReader recordReader for RealtimeInputFormat.
   */
  public void doEvolutionForRealtimeInputFormat(AbstractRealtimeRecordReader realtimeRecordReader) throws Exception {
    if (!(split instanceof RealtimeSplit)) {
      LOG.warn("expect realtime split for mor table, but find other type split {}", split);
      return;
    }
    if (internalSchemaOption.isPresent()) {
      Schema tableAvroSchema = getAvroSchemaFromCache();
      List<String> requiredColumns = getRequireColumn(job);
      InternalSchema prunedInternalSchema = InternalSchemaUtils.pruneInternalSchema(internalSchemaOption.get(),
          requiredColumns);
      // Add partitioning fields to the writer schema so the resulting rows contain null values for these fields
      String partitionFields = job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
      List<String> partitioningFields = !partitionFields.isEmpty() ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList())
          : new ArrayList<>();
      Schema writerSchema = AvroInternalSchemaConverter.convert(internalSchemaOption.get(), tableAvroSchema.getName());
      writerSchema = HoodieRealtimeRecordReaderUtils.addPartitionFields(writerSchema, partitioningFields);
      Map<String, Schema.Field> schemaFieldsMap = HoodieRealtimeRecordReaderUtils.getNameToFieldMap(writerSchema);
      // we should use HoodieParquetInputFormat#HIVE_TMP_COLUMNS, since serdeConstants#LIST_COLUMNS may be changed by HoodieParquetInputFormat#setColumnNameList
      Schema hiveSchema = realtimeRecordReader.constructHiveOrderedSchema(writerSchema, schemaFieldsMap, job.get(HIVE_TMP_COLUMNS));
      Schema readerSchema = AvroInternalSchemaConverter.convert(prunedInternalSchema, tableAvroSchema.getName());
      // setUp evolution schema
      realtimeRecordReader.setWriterSchema(writerSchema);
      realtimeRecordReader.setReaderSchema(readerSchema);
      realtimeRecordReader.setHiveSchema(hiveSchema);
      internalSchemaOption = Option.of(prunedInternalSchema);
      RealtimeSplit realtimeSplit = (RealtimeSplit) split;
      LOG.info("About to read compacted logs {} for base split {}, projecting cols {}",
          realtimeSplit.getDeltaLogPaths(), realtimeSplit.getPath(), requiredColumns);
    }
  }

  /**
   * Do schema evolution for ParquetFormat.
   */
  public void doEvolutionForParquetFormat() {
    if (internalSchemaOption.isPresent()) {
      // reading hoodie schema evolution table
      job.setBoolean(HIVE_EVOLUTION_ENABLE, true);
      Path finalPath = ((FileSplit) split).getPath();
      InternalSchema prunedSchema;
      List<String> requiredColumns = getRequireColumn(job);
      // No need to trigger schema evolution for count(*)/count(1) operations
      boolean disableSchemaEvolution =
          requiredColumns.isEmpty() || (requiredColumns.size() == 1 && requiredColumns.get(0).isEmpty());
      if (!disableSchemaEvolution) {
        prunedSchema = InternalSchemaUtils.pruneInternalSchema(internalSchemaOption.get(), requiredColumns);
        InternalSchema querySchema = prunedSchema;
        long commitTime = Long.parseLong(FSUtils.getCommitTime(finalPath.getName()));
        InternalSchema fileSchema = InternalSchemaCache.searchSchemaAndCache(commitTime, metaClient);
        InternalSchema mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchema, true,
            true).mergeSchema();
        List<Types.Field> fields = mergedInternalSchema.columns();
        setColumnNameList(job, fields);
        setColumnTypeList(job, fields);
        pushDownFilter(job, querySchema, fileSchema);
      }
    }
  }

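  /**
   * Rewrites {@code serdeConstants.LIST_COLUMN_TYPES} so that the projected columns carry the
   * evolved (merged) types, recursing into struct/array/map types where necessary.
   */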
  public void setColumnTypeList(JobConf job, List<Types.Field> fields) {
    List<TypeInfo> fullTypeInfos = TypeInfoUtils.getTypeInfosFromTypeString(job.get(serdeConstants.LIST_COLUMN_TYPES));
    List<Integer> tmpColIdList = Arrays.stream(job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR).split(","))
        .map(Integer::parseInt).collect(Collectors.toList());
    if (tmpColIdList.size() != fields.size()) {
      throw new HoodieException(String.format("The size of hive.io.file.readcolumn.ids: %s is not equal to projection columns: %s",
          job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), fields.stream().map(Types.Field::name).collect(Collectors.joining(","))));
    }
    List<TypeInfo> fieldTypes = new ArrayList<>();
    for (int i = 0; i < tmpColIdList.size(); i++) {
      Types.Field field = fields.get(i);
      TypeInfo typeInfo = TypeInfoUtils.getTypeInfosFromTypeString(fullTypeInfos.get(tmpColIdList.get(i)).getQualifiedName()).get(0);
      TypeInfo fieldType = constructHiveSchemaFromType(field.type(), typeInfo);
      fieldTypes.add(fieldType);
    }
    for (int i = 0; i < tmpColIdList.size(); i++) {
      TypeInfo typeInfo = fieldTypes.get(i);
      if (!(typeInfo instanceof PrimitiveTypeInfo)) {
        int index = tmpColIdList.get(i);
        fullTypeInfos.remove(index);
        fullTypeInfos.add(index, typeInfo);
      }
    }
    List<String> fullColTypeList = TypeInfoUtils.getTypeStringsFromTypeInfo(fullTypeInfos);
    String fullColTypeListString = String.join(",", fullColTypeList);
    job.set(serdeConstants.LIST_COLUMN_TYPES, fullColTypeListString);
  }

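  /**
   * Recursively rebuilds a Hive {@link TypeInfo} for the given internal-schema {@link Type},
   * reusing the existing Hive type information for primitive leaves.
   */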
  private TypeInfo constructHiveSchemaFromType(Type type, TypeInfo typeInfo) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Types.Field> fields = record.fields();
        ArrayList<TypeInfo> fieldTypes = new ArrayList<>();
        ArrayList<String> fieldNames = new ArrayList<>();
        for (int index = 0; index < fields.size(); index++) {
          StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
          TypeInfo subTypeInfo = getSchemaSubTypeInfo(structTypeInfo.getAllStructFieldTypeInfos().get(index), fields.get(index).type());
          fieldTypes.add(subTypeInfo);
          String name = fields.get(index).name();
          fieldNames.add(name);
        }
        StructTypeInfo structTypeInfo = new StructTypeInfo();
        structTypeInfo.setAllStructFieldNames(fieldNames);
        structTypeInfo.setAllStructFieldTypeInfos(fieldTypes);
        return structTypeInfo;
      case ARRAY:
        ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
        Types.ArrayType array = (Types.ArrayType) type;
        TypeInfo subTypeInfo = getSchemaSubTypeInfo(listTypeInfo.getListElementTypeInfo(), array.elementType());
        listTypeInfo.setListElementTypeInfo(subTypeInfo);
        return listTypeInfo;
      case MAP:
        Types.MapType map = (Types.MapType) type;
        MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
        TypeInfo keyType = getSchemaSubTypeInfo(mapTypeInfo.getMapKeyTypeInfo(), map.keyType());
        TypeInfo valueType = getSchemaSubTypeInfo(mapTypeInfo.getMapValueTypeInfo(), map.valueType());
        MapTypeInfo mapType = new MapTypeInfo();
        mapType.setMapKeyTypeInfo(keyType);
        mapType.setMapValueTypeInfo(valueType);
        return mapType;
      case BOOLEAN:
      case INT:
      case LONG:
      case FLOAT:
      case DOUBLE:
      case DATE:
      case TIMESTAMP:
      case STRING:
      case UUID:
      case FIXED:
      case BINARY:
      case DECIMAL:
        return typeInfo;
      case TIME:
        throw new UnsupportedOperationException(String.format("cannot convert %s type to Hive", type));
      default:
        LOG.error("cannot convert unknown type: {} to Hive", type);
        throw new UnsupportedOperationException(String.format("cannot convert unknown type: %s to Hive", type));
    }
  }

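  /**
   * Returns the Hive {@link TypeInfo} for a nested element: primitives are returned as-is,
   * complex types are rebuilt via {@link #constructHiveSchemaFromType}.
   */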
  private TypeInfo getSchemaSubTypeInfo(TypeInfo hoodieTypeInfo, Type hiveType) {
    TypeInfo subTypeInfo = TypeInfoUtils.getTypeInfosFromTypeString(hoodieTypeInfo.getQualifiedName()).get(0);
    TypeInfo typeInfo;
    if (subTypeInfo instanceof PrimitiveTypeInfo) {
      typeInfo = subTypeInfo;
    } else {
      typeInfo = constructHiveSchemaFromType(hiveType, subTypeInfo);
    }
    return typeInfo;
  }

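  /**
   * Rewrites column references inside the pushed-down Hive filter expression so that they match
   * the column names in the file schema, then re-serializes the expression into the job conf.
   */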
  private void pushDownFilter(JobConf job, InternalSchema querySchema, InternalSchema fileSchema) {
    String filterExprSerialized = job.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (filterExprSerialized != null) {
      ExprNodeGenericFuncDesc filterExpr = SerializationUtilities.deserializeExpression(filterExprSerialized);
      LinkedList<ExprNodeDesc> exprNodes = new LinkedList<>();
      exprNodes.add(filterExpr);
      while (!exprNodes.isEmpty()) {
        int size = exprNodes.size();
        for (int i = 0; i < size; i++) {
          ExprNodeDesc expr = exprNodes.poll();
          if (expr instanceof ExprNodeColumnDesc) {
            String oldColumn = ((ExprNodeColumnDesc) expr).getColumn();
            String newColumn = InternalSchemaUtils.reBuildFilterName(oldColumn, fileSchema, querySchema);
            ((ExprNodeColumnDesc) expr).setColumn(newColumn);
          }
          List<ExprNodeDesc> children = expr.getChildren();
          if (children != null) {
            exprNodes.addAll(children);
          }
        }
      }
      String filterText = filterExpr.getExprString();
      String serializedFilterExpr = SerializationUtilities.serializeExpression(filterExpr);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Pushdown initiated with filterText = {}, filterExpr = {}, serializedFilterExpr = {}",
            filterText, filterExpr, serializedFilterExpr);
      }
      job.set(TableScanDesc.FILTER_TEXT_CONF_STR, filterText);
      job.set(TableScanDesc.FILTER_EXPR_CONF_STR, serializedFilterExpr);
    }
  }

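  /**
   * Rewrites {@code ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR} and
   * {@code serdeConstants.LIST_COLUMNS} so the projected column names match the merged schema fields.
   */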
  private void setColumnNameList(JobConf job, List<Types.Field> fields) {
    if (fields == null) {
      return;
    }
    List<String> tmpColIdList = Arrays.asList(job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR).split(","));
    if (fields.size() != tmpColIdList.size()) {
      return;
    }
    StringBuilder readColumnNames = new StringBuilder();
    List<String> tmpColNameList = Arrays.asList(job.get(serdeConstants.LIST_COLUMNS).split(","));
    List<String> fullColNamelist = new ArrayList<>(tmpColNameList);
    for (int index = 0; index < fields.size(); index++) {
      String colName = fields.get(index).name();
      if (readColumnNames.length() > 0) {
        readColumnNames.append(',');
      }
      readColumnNames.append(colName);
      int id = Integer.parseInt(tmpColIdList.get(index));
      if (!colName.equals(fullColNamelist.get(id))) {
        fullColNamelist.remove(id);
        fullColNamelist.add(id, colName);
      }
    }
    String readColumnNamesString = readColumnNames.toString();
    job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColumnNamesString);
    String fullColNamelistString = String.join(",", fullColNamelist);
    job.set(serdeConstants.LIST_COLUMNS, fullColNamelistString);
  }

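  /**
   * Returns the projected column names for the read, snapshotting the original Hive projection
   * configs into the temporary keys on first access so later rewrites do not lose them.
   */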
  public static List<String> getRequireColumn(JobConf jobConf) {
    String originColumnString = jobConf.get(HIVE_TMP_READ_COLUMN_NAMES_CONF_STR);
    if (StringUtils.isNullOrEmpty(originColumnString)) {
      jobConf.set(HIVE_TMP_READ_COLUMN_NAMES_CONF_STR, jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
    }
    String hoodieFullColumnString = jobConf.get(HIVE_TMP_COLUMNS);
    if (StringUtils.isNullOrEmpty(hoodieFullColumnString)) {
      jobConf.set(HIVE_TMP_COLUMNS, jobConf.get(serdeConstants.LIST_COLUMNS));
    }
    String tableColumnString = jobConf.get(HIVE_TMP_READ_COLUMN_NAMES_CONF_STR);
    List<String> tableColumns = Arrays.asList(tableColumnString.split(","));
    return new ArrayList<>(tableColumns);
  }
}