/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.util;

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;

import org.apache.avro.Schema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Helper class to generate commit metadata.
 */
public class CommitUtils {

  private static final Logger LOG = LogManager.getLogger(CommitUtils.class);
  private static final String NULL_SCHEMA_STR = Schema.create(Schema.Type.NULL).toString();

  /**
 * Gets the commit action type for the given write operation and table type.
 * Use this API when the commit action type can differ not only by table type but also by write operation type.
 * For example, the INSERT_OVERWRITE, INSERT_OVERWRITE_TABLE, and DELETE_PARTITION operations map to the REPLACE commit action type.
   */
  public static String getCommitActionType(WriteOperationType operation, HoodieTableType tableType) {
    if (operation == WriteOperationType.INSERT_OVERWRITE || operation == WriteOperationType.INSERT_OVERWRITE_TABLE
        || operation == WriteOperationType.DELETE_PARTITION) {
      return HoodieTimeline.REPLACE_COMMIT_ACTION;
    } else {
      return getCommitActionType(tableType);
    }
  }
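
  // Illustrative usage sketch (hypothetical call sites): INSERT_OVERWRITE resolves
  // to the replace action regardless of table type, while UPSERT falls through to
  // the table-type default computed in the overload below.
  //
  //   String a1 = CommitUtils.getCommitActionType(WriteOperationType.INSERT_OVERWRITE, HoodieTableType.COPY_ON_WRITE);
  //   // a1 == HoodieTimeline.REPLACE_COMMIT_ACTION ("replacecommit")
  //   String a2 = CommitUtils.getCommitActionType(WriteOperationType.UPSERT, HoodieTableType.MERGE_ON_READ);
  //   // a2 == HoodieActiveTimeline.DELTA_COMMIT_ACTION ("deltacommit")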

  /**
 * Gets the commit action type for the given table type.
   * Note: Use this API only when the commit action type is not dependent on the write operation type.
   * See {@link CommitUtils#getCommitActionType(WriteOperationType, HoodieTableType)} for more details.
   */
  public static String getCommitActionType(HoodieTableType tableType) {
    switch (tableType) {
      case COPY_ON_WRITE:
        return HoodieActiveTimeline.COMMIT_ACTION;
      case MERGE_ON_READ:
        return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
      default:
        throw new HoodieException("Could not commit on unknown table type " + tableType);
    }
  }

  public static HoodieCommitMetadata buildMetadata(List<HoodieWriteStat> writeStats,
                                                   Map<String, List<String>> partitionToReplaceFileIds,
                                                   Option<Map<String, String>> extraMetadata,
                                                   WriteOperationType operationType,
                                                   String schemaToStoreInCommit,
                                                   String commitActionType) {

    HoodieCommitMetadata commitMetadata = buildMetadataFromStats(writeStats, partitionToReplaceFileIds, commitActionType, operationType);

    // add in extra metadata
    if (extraMetadata.isPresent()) {
      extraMetadata.get().forEach(commitMetadata::addMetadata);
    }
    commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, (schemaToStoreInCommit == null || schemaToStoreInCommit.equals(NULL_SCHEMA_STR))
        ? "" : schemaToStoreInCommit);
    commitMetadata.setOperationType(operationType);
    return commitMetadata;
  }
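
  // Illustrative sketch (hypothetical stat values): assembling replace-commit
  // metadata for a single overwritten partition. Passing REPLACE_COMMIT_ACTION
  // makes the result a HoodieReplaceCommitMetadata carrying the replaced file ids.
  //
  //   HoodieWriteStat stat = new HoodieWriteStat();
  //   stat.setPartitionPath("2024/01/01");
  //   stat.setFileId("new-file-id");
  //   stat.setPath("2024/01/01/new-file-id_0-1-2_000.parquet");
  //   HoodieCommitMetadata metadata = CommitUtils.buildMetadata(
  //       java.util.Collections.singletonList(stat),
  //       java.util.Collections.singletonMap("2024/01/01", java.util.Collections.singletonList("old-file-id")),
  //       Option.empty(),
  //       WriteOperationType.INSERT_OVERWRITE,
  //       null,                                  // schema to store; null or the NULL schema is stored as ""
  //       HoodieTimeline.REPLACE_COMMIT_ACTION);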

  private static HoodieCommitMetadata buildMetadataFromStats(List<HoodieWriteStat> writeStats,
                                                             Map<String, List<String>> partitionToReplaceFileIds,
                                                             String commitActionType,
                                                             WriteOperationType operationType) {
    final HoodieCommitMetadata commitMetadata;
    if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(commitActionType)) {
      HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata();
      replaceMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
      commitMetadata = replaceMetadata;
    } else {
      commitMetadata = new HoodieCommitMetadata();
    }

    for (HoodieWriteStat writeStat : writeStats) {
      String partition = writeStat.getPartitionPath();
      commitMetadata.addWriteStat(partition, writeStat);
    }

    LOG.info("Creating  metadata for " + operationType + " numWriteStats:" + writeStats.size()
        + " numReplaceFileIds:" + partitionToReplaceFileIds.values().stream().mapToInt(e -> e.size()).sum());
    return commitMetadata;
  }

  public static HashMap<String, String> getFileIdWithoutSuffixAndRelativePathsFromSpecificRecord(Map<String, List<org.apache.hudi.avro.model.HoodieWriteStat>>
                                                                                                     partitionToWriteStats) {
    HashMap<String, String> fileIdToPath = new HashMap<>();
    // collect fileId -> relative path across all partitions
    for (Map.Entry<String, List<org.apache.hudi.avro.model.HoodieWriteStat>> entry : partitionToWriteStats.entrySet()) {
      for (org.apache.hudi.avro.model.HoodieWriteStat stat : entry.getValue()) {
        fileIdToPath.put(stat.getFileId(), stat.getPath());
      }
    }
    return fileIdToPath;
  }

  public static HashMap<String, String> getFileIdWithoutSuffixAndRelativePaths(Map<String, List<HoodieWriteStat>>
      partitionToWriteStats) {
    HashMap<String, String> fileIdToPath = new HashMap<>();
    // collect fileId -> relative path across all partitions
    for (Map.Entry<String, List<HoodieWriteStat>> entry : partitionToWriteStats.entrySet()) {
      for (HoodieWriteStat stat : entry.getValue()) {
        fileIdToPath.put(stat.getFileId(), stat.getPath());
      }
    }
    return fileIdToPath;
  }
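
  // Illustrative sketch (hypothetical values): both helpers above flatten a
  // partition -> write-stats map into a single fileId -> relative-path lookup;
  // they differ only in whether the stats are the Avro-model or common-model type.
  //
  //   Map<String, List<HoodieWriteStat>> byPartition = java.util.Collections.singletonMap(
  //       "2024/01/01", java.util.Collections.singletonList(stat));   // stat as in the sketch above
  //   HashMap<String, String> fileIdToPath = CommitUtils.getFileIdWithoutSuffixAndRelativePaths(byPartition);
  //   // fileIdToPath.get("new-file-id") == "2024/01/01/new-file-id_0-1-2_000.parquet"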

  /**
   * Processes previous commit metadata in the timeline to determine the latest checkpoint given a checkpoint key.
   * NOTE: This is very similar in intent to DeltaSync#getLatestCommitMetadataWithValidCheckpointInfo, except that
   *       different deployment models (DeltaStreamer or Spark structured streaming) can use different checkpoint keys.
   *
   * @param timeline completed commits in active timeline.
   * @param checkpointKey the checkpoint key in the extra metadata of the commit.
   * @return An optional commit metadata with latest checkpoint.
   */
  public static Option<HoodieCommitMetadata> getLatestCommitMetadataWithValidCheckpointInfo(HoodieTimeline timeline, String checkpointKey) {
    return (Option<HoodieCommitMetadata>) timeline.getReverseOrderedInstants().map(instant -> {
      try {
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
            .fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
        if (StringUtils.nonEmpty(commitMetadata.getMetadata(checkpointKey))) {
          return Option.of(commitMetadata);
        } else {
          return Option.empty();
        }
      } catch (IOException e) {
        throw new HoodieIOException("Failed to parse HoodieCommitMetadata for " + instant.toString(), e);
      }
    }).filter(Option::isPresent).findFirst().orElse(Option.empty());
  }
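
  // Illustrative sketch (metaClient is an assumed in-scope HoodieTableMetaClient;
  // the key shown is the DeltaStreamer checkpoint key): scanning completed commits
  // newest-first for the latest one that carries checkpoint info.
  //
  //   Option<HoodieCommitMetadata> latest = CommitUtils.getLatestCommitMetadataWithValidCheckpointInfo(
  //       metaClient.getCommitsTimeline().filterCompletedInstants(), "deltastreamer.checkpoint.key");
  //   Option<String> checkpoint = latest.map(m -> m.getMetadata("deltastreamer.checkpoint.key"));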
}