All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.table.TableSchemaResolver Maven / Gradle / Ivy

There is a newer version: 1.0.0-beta1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.table;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieIncompatibleSchemaException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.io.storage.HoodieAvroHFileReader;
import org.apache.hudi.io.storage.HoodieAvroOrcReader;
import org.apache.hudi.util.Lazy;

import org.apache.avro.JsonProperties;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

import javax.annotation.concurrent.ThreadSafe;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema;
import static org.apache.hudi.avro.AvroSchemaUtils.containsFieldInSchema;
import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema;

/**
 * Helper class to read schema from data files and log files and to convert it between different formats.
 */
@ThreadSafe
public class TableSchemaResolver {

  private static final Logger LOG = LogManager.getLogger(TableSchemaResolver.class);

  private final HoodieTableMetaClient metaClient;

  /**
   * Signals whether suite of the meta-fields should have additional field designating
   * operation particular record was added by. Note, that determining whether this meta-field
   * should be appended to the schema requires reading out the actual schema of some data file,
   * since it's ultimately the source of truth whether this field has to be represented in
   * the schema
   */
  private final Lazy hasOperationField;

  /**
   * NOTE: {@link HoodieCommitMetadata} could be of non-trivial size for large tables (in 100s of Mbs)
   *       and therefore we'd want to limit amount of throw-away work being performed while fetching
   *       commits' metadata
   *
   *       Please check out corresponding methods to fetch commonly used instances of {@link HoodieCommitMetadata}:
   *       {@link #getLatestCommitMetadataWithValidSchema()},
   *       {@link #getLatestCommitMetadataWithValidSchema()},
   *       {@link #getCachedCommitMetadata(HoodieInstant)}
   */
  private final Lazy> commitMetadataCache;

  private volatile HoodieInstant latestCommitWithValidSchema = null;
  private volatile HoodieInstant latestCommitWithValidData = null;

  public TableSchemaResolver(HoodieTableMetaClient metaClient) {
    this.metaClient = metaClient;
    this.commitMetadataCache = Lazy.lazily(() -> new ConcurrentHashMap<>(2));
    this.hasOperationField = Lazy.lazily(this::hasOperationField);
  }

  public Schema getTableAvroSchemaFromDataFile() {
    return convertParquetSchemaToAvro(getTableParquetSchemaFromDataFile());
  }

  /**
   * Gets full schema (user + metadata) for a hoodie table in Avro format.
   *
   * @return Avro schema for this table
   * @throws Exception
   */
  public Schema getTableAvroSchema() throws Exception {
    return getTableAvroSchema(metaClient.getTableConfig().populateMetaFields());
  }

  /**
   * Gets schema for a hoodie table in Avro format, can choice if include metadata fields.
   *
   * @param includeMetadataFields choice if include metadata fields
   * @return Avro schema for this table
   * @throws Exception
   */
  public Schema getTableAvroSchema(boolean includeMetadataFields) throws Exception {
    return getTableAvroSchemaInternal(includeMetadataFields, Option.empty());
  }

  /**
   * Fetches tables schema in Avro format as of the given instant
   *
   * @param timestamp as of which table's schema will be fetched
   */
  public Schema getTableAvroSchema(String timestamp) throws Exception {
    Option instant = metaClient.getActiveTimeline().getCommitsTimeline()
        .filterCompletedInstants()
        .findInstantsBeforeOrEquals(timestamp)
        .lastInstant();
    return getTableAvroSchemaInternal(metaClient.getTableConfig().populateMetaFields(), instant);
  }

  /**
   * Fetches tables schema in Avro format as of the given instant
   *
   * @param instant as of which table's schema will be fetched
   */
  public Schema getTableAvroSchema(HoodieInstant instant, boolean includeMetadataFields) throws Exception {
    return getTableAvroSchemaInternal(includeMetadataFields, Option.of(instant));
  }

  /**
   * Gets full schema (user + metadata) for a hoodie table in Parquet format.
   *
   * @return Parquet schema for the table
   */
  public MessageType getTableParquetSchema() throws Exception {
    return convertAvroSchemaToParquet(getTableAvroSchema(true));
  }

  /**
   * Gets users data schema for a hoodie table in Parquet format.
   *
   * @return Parquet schema for the table
   */
  public MessageType getTableParquetSchema(boolean includeMetadataField) throws Exception {
    return convertAvroSchemaToParquet(getTableAvroSchema(includeMetadataField));
  }

  /**
   * Gets users data schema for a hoodie table in Avro format.
   *
   * @return  Avro user data schema
   * @throws Exception
   *
   * @deprecated use {@link #getTableAvroSchema(boolean)} instead
   */
  @Deprecated
  public Schema getTableAvroSchemaWithoutMetadataFields() throws Exception {
    return getTableAvroSchema(false);
  }

  private Schema getTableAvroSchemaInternal(boolean includeMetadataFields, Option instantOpt) {
    Schema schema =
        (instantOpt.isPresent()
            ? getTableSchemaFromCommitMetadata(instantOpt.get(), includeMetadataFields)
            : getTableSchemaFromLatestCommitMetadata(includeMetadataFields))
            .or(() ->
                metaClient.getTableConfig().getTableCreateSchema()
                    .map(tableSchema ->
                        includeMetadataFields
                            ? HoodieAvroUtils.addMetadataFields(tableSchema, hasOperationField.get())
                            : tableSchema)
            )
            .orElseGet(() -> {
              Schema schemaFromDataFile = getTableAvroSchemaFromDataFile();
              return includeMetadataFields
                  ? schemaFromDataFile
                  : HoodieAvroUtils.removeMetadataFields(schemaFromDataFile);
            });

    // TODO partition columns have to be appended in all read-paths
    if (metaClient.getTableConfig().shouldDropPartitionColumns()) {
      return metaClient.getTableConfig().getPartitionFields()
          .map(partitionFields -> appendPartitionColumns(schema, Option.ofNullable(partitionFields)))
          .orElse(schema);
    }

    return schema;
  }

  private Option getTableSchemaFromLatestCommitMetadata(boolean includeMetadataFields) {
    Option> instantAndCommitMetadata = getLatestCommitMetadataWithValidSchema();
    if (instantAndCommitMetadata.isPresent()) {
      HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
      String schemaStr = commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
      Schema schema = new Schema.Parser().parse(schemaStr);
      if (includeMetadataFields) {
        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField.get());
      }
      return Option.of(schema);
    } else {
      return Option.empty();
    }
  }

  private Option getTableSchemaFromCommitMetadata(HoodieInstant instant, boolean includeMetadataFields) {
    try {
      HoodieCommitMetadata metadata = getCachedCommitMetadata(instant);
      String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);

      if (StringUtils.isNullOrEmpty(existingSchemaStr)) {
        return Option.empty();
      }

      Schema schema = new Schema.Parser().parse(existingSchemaStr);
      if (includeMetadataFields) {
        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField.get());
      }
      return Option.of(schema);
    } catch (Exception e) {
      throw new HoodieException("Failed to read schema from commit metadata", e);
    }
  }

  /**
   * Fetches the schema for a table from any the table's data files
   */
  private MessageType getTableParquetSchemaFromDataFile() {
    Option> instantAndCommitMetadata = getLatestCommitMetadataWithValidData();
    try {
      switch (metaClient.getTableType()) {
        case COPY_ON_WRITE:
        case MERGE_ON_READ:
          // For COW table, data could be written in either Parquet or Orc format currently;
          // For MOR table, data could be written in either Parquet, Orc, Hfile or Delta-log format currently;
          //
          // Determine the file format based on the file name, and then extract schema from it.
          if (instantAndCommitMetadata.isPresent()) {
            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
            Iterator filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePathV2()).values().iterator();
            return fetchSchemaFromFiles(filePaths);
          } else {
            throw new IllegalArgumentException("Could not find any data file written for commit, "
                + "so could not get schema for table " + metaClient.getBasePath());
          }
        default:
          LOG.error("Unknown table type " + metaClient.getTableType());
          throw new InvalidTableException(metaClient.getBasePath());
      }
    } catch (IOException e) {
      throw new HoodieException("Failed to read data schema", e);
    }
  }

  public static MessageType convertAvroSchemaToParquet(Schema schema, Configuration hadoopConf) {
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(hadoopConf);
    return avroSchemaConverter.convert(schema);
  }

  private Schema convertParquetSchemaToAvro(MessageType parquetSchema) {
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf());
    return avroSchemaConverter.convert(parquetSchema);
  }

  private MessageType convertAvroSchemaToParquet(Schema schema) {
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf());
    return avroSchemaConverter.convert(schema);
  }

  /**
   * Returns table's latest Avro {@link Schema} iff table is non-empty (ie there's at least
   * a single commit)
   *
   * This method differs from {@link #getTableAvroSchema(boolean)} in that it won't fallback
   * to use table's schema used at creation
   */
  public Option getTableAvroSchemaFromLatestCommit(boolean includeMetadataFields) throws Exception {
    if (metaClient.isTimelineNonEmpty()) {
      return Option.of(getTableAvroSchemaInternal(includeMetadataFields, Option.empty()));
    }

    return Option.empty();
  }

  private MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException {
    LOG.info("Reading schema from " + parquetFilePath);

    FileSystem fs = metaClient.getRawFs();
    ParquetMetadata fileFooter =
        ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
    return fileFooter.getFileMetaData().getSchema();
  }

  private MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException {
    LOG.info("Reading schema from " + hFilePath);

    FileSystem fs = metaClient.getRawFs();
    CacheConfig cacheConfig = new CacheConfig(fs.getConf());
    HoodieAvroHFileReader hFileReader = new HoodieAvroHFileReader(fs.getConf(), hFilePath, cacheConfig);
    return convertAvroSchemaToParquet(hFileReader.getSchema());
  }

  private MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException {
    LOG.info("Reading schema from " + orcFilePath);

    FileSystem fs = metaClient.getRawFs();
    HoodieAvroOrcReader orcReader = new HoodieAvroOrcReader(fs.getConf(), orcFilePath);
    return convertAvroSchemaToParquet(orcReader.getSchema());
  }

  /**
   * Read schema from a data file from the last compaction commit done.
   *
   * @deprecated please use {@link #getTableAvroSchema(HoodieInstant, boolean)} instead
   */
  public MessageType readSchemaFromLastCompaction(Option lastCompactionCommitOpt) throws Exception {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();

    HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow(() -> new Exception(
        "Could not read schema from last compaction, no compaction commits found on path " + metaClient));

    // Read from the compacted file wrote
    HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata
        .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
    String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePathV2()).values().stream().findAny()
        .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
            + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
    return readSchemaFromBaseFile(filePath);
  }

  private MessageType readSchemaFromLogFile(Path path) throws IOException {
    return readSchemaFromLogFile(metaClient.getRawFs(), path);
  }

  /**
   * Read the schema from the log file on path.
   *
   * @return
   */
  public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
    try (Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null)) {
      HoodieDataBlock lastBlock = null;
      while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        if (block instanceof HoodieDataBlock) {
          lastBlock = (HoodieDataBlock) block;
        }
      }
      return lastBlock != null ? new AvroSchemaConverter().convert(lastBlock.getSchema()) : null;
    }
  }

  /**
   * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant.
   *
   * @return InternalSchema for this table
   */
  public Option getTableInternalSchemaFromCommitMetadata() {
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    return timeline.lastInstant().flatMap(this::getTableInternalSchemaFromCommitMetadata);
  }

  /**
   * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant.
   *
   * @return InternalSchema for this table
   */
  public Option getTableInternalSchemaFromCommitMetadata(String timestamp) {
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
        .filterCompletedInstants()
        .findInstantsBeforeOrEquals(timestamp);
    return timeline.lastInstant().flatMap(this::getTableInternalSchemaFromCommitMetadata);
  }

  /**
   * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant.
   *
   * @return InternalSchema for this table
   */
  private Option getTableInternalSchemaFromCommitMetadata(HoodieInstant instant) {
    try {
      HoodieCommitMetadata metadata = getCachedCommitMetadata(instant);
      String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
      if (latestInternalSchemaStr != null) {
        return SerDeHelper.fromJson(latestInternalSchemaStr);
      } else {
        return Option.empty();
      }
    } catch (Exception e) {
      throw new HoodieException("Failed to read schema from commit metadata", e);
    }
  }

  /**
   * Gets the history schemas as String for a hoodie table from the HoodieCommitMetadata of the instant.
   *
   * @return history schemas string for this table
   */
  public Option getTableHistorySchemaStrFromCommitMetadata() {
    // now we only support FileBaseInternalSchemaManager
    FileBasedInternalSchemaStorageManager manager = new FileBasedInternalSchemaStorageManager(metaClient);
    String result = manager.getHistorySchemaStr();
    return result.isEmpty() ? Option.empty() : Option.of(result);
  }

  /**
   * NOTE: This method could only be used in tests
   *
   * @VisibleForTesting
   */
  public boolean hasOperationField() {
    try {
      Schema tableAvroSchema = getTableAvroSchemaFromDataFile();
      return tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null;
    } catch (Exception e) {
      LOG.info(String.format("Failed to read operation field from avro schema (%s)", e.getMessage()));
      return false;
    }
  }

  private Option> getLatestCommitMetadataWithValidSchema() {
    if (latestCommitWithValidSchema == null) {
      Option> instantAndCommitMetadata =
          metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema();
      if (instantAndCommitMetadata.isPresent()) {
        HoodieInstant instant = instantAndCommitMetadata.get().getLeft();
        HoodieCommitMetadata metadata = instantAndCommitMetadata.get().getRight();
        synchronized (this) {
          if (latestCommitWithValidSchema == null) {
            latestCommitWithValidSchema = instant;
          }
          commitMetadataCache.get().putIfAbsent(instant, metadata);
        }
      }
    }

    return Option.ofNullable(latestCommitWithValidSchema)
        .map(instant -> Pair.of(instant, commitMetadataCache.get().get(instant)));
  }

  private Option> getLatestCommitMetadataWithValidData() {
    if (latestCommitWithValidData == null) {
      Option> instantAndCommitMetadata =
          metaClient.getActiveTimeline().getLastCommitMetadataWithValidData();
      if (instantAndCommitMetadata.isPresent()) {
        HoodieInstant instant = instantAndCommitMetadata.get().getLeft();
        HoodieCommitMetadata metadata = instantAndCommitMetadata.get().getRight();
        synchronized (this) {
          if (latestCommitWithValidData == null) {
            latestCommitWithValidData = instant;
          }
          commitMetadataCache.get().putIfAbsent(instant, metadata);
        }
      }
    }

    return Option.ofNullable(latestCommitWithValidData)
        .map(instant -> Pair.of(instant, commitMetadataCache.get().get(instant)));
  }

  private HoodieCommitMetadata getCachedCommitMetadata(HoodieInstant instant) {
    return commitMetadataCache.get()
        .computeIfAbsent(instant, (missingInstant) -> {
          HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
          byte[] data = timeline.getInstantDetails(missingInstant).get();
          try {
            return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
          } catch (IOException e) {
            throw new HoodieIOException(String.format("Failed to fetch HoodieCommitMetadata for instant (%s)", missingInstant), e);
          }
        });
  }

  private MessageType fetchSchemaFromFiles(Iterator filePaths) throws IOException {
    MessageType type = null;
    while (filePaths.hasNext() && type == null) {
      String filePath = filePaths.next();
      if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
        // this is a log file
        type = readSchemaFromLogFile(new Path(filePath));
      } else {
        type = readSchemaFromBaseFile(filePath);
      }
    }
    return type;
  }

  private MessageType readSchemaFromBaseFile(String filePath) throws IOException {
    if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
      return readSchemaFromParquetBaseFile(new Path(filePath));
    } else if (filePath.contains(HoodieFileFormat.HFILE.getFileExtension())) {
      return readSchemaFromHFileBaseFile(new Path(filePath));
    } else if (filePath.contains(HoodieFileFormat.ORC.getFileExtension())) {
      return readSchemaFromORCBaseFile(new Path(filePath));
    } else {
      throw new IllegalArgumentException("Unknown base file format :" + filePath);
    }
  }

  public static Schema appendPartitionColumns(Schema dataSchema, Option partitionFields) {
    // In cases when {@link DROP_PARTITION_COLUMNS} config is set true, partition columns
    // won't be persisted w/in the data files, and therefore we need to append such columns
    // when schema is parsed from data files
    //
    // Here we append partition columns with {@code StringType} as the data type
    if (!partitionFields.isPresent() || partitionFields.get().length == 0) {
      return dataSchema;
    }

    boolean hasPartitionColNotInSchema = Arrays.stream(partitionFields.get()).anyMatch(pf -> !containsFieldInSchema(dataSchema, pf));
    boolean hasPartitionColInSchema = Arrays.stream(partitionFields.get()).anyMatch(pf -> containsFieldInSchema(dataSchema, pf));
    if (hasPartitionColNotInSchema && hasPartitionColInSchema) {
      throw new HoodieIncompatibleSchemaException("Partition columns could not be partially contained w/in the data schema");
    }

    if (hasPartitionColNotInSchema) {
      // when hasPartitionColNotInSchema is true and hasPartitionColInSchema is false, all partition columns
      // are not in originSchema. So we create and add them.
      List newFields = new ArrayList<>();
      for (String partitionField: partitionFields.get()) {
        newFields.add(new Schema.Field(
            partitionField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE));
      }
      return appendFieldsToSchema(dataSchema, newFields);
    }

    return dataSchema;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy