/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.client;

import org.apache.hudi.avro.model.HoodieCompactionOperation;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.CompactionOperation;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.table.timeline.InstantFileNameGenerator;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
import org.apache.hudi.table.action.compact.OperationResult;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION;
import static org.apache.hudi.common.table.timeline.InstantComparison.GREATER_THAN_OR_EQUALS;
import static org.apache.hudi.common.table.timeline.InstantComparison.compareTimestamps;

/**
 * Client to perform admin operations related to compaction.
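 *
 * <p>A minimal usage sketch; the engine context, base path and instant time below are
 * illustrative assumptions rather than values taken from this class:
 * <pre>{@code
 * HoodieEngineContext engineContext = ...; // engine-specific context (e.g. Spark or Flink), elided here
 * CompactionAdminClient admin = new CompactionAdminClient(engineContext, "/tmp/hoodie/sample_table");
 * // With dryRun = true nothing on the timeline is modified
 * admin.unscheduleCompactionPlan("20230101010101", false, 2, true);
 * }</pre>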
 */
public class CompactionAdminClient extends BaseHoodieClient {

  private static final Logger LOG = LoggerFactory.getLogger(CompactionAdminClient.class);

  public CompactionAdminClient(HoodieEngineContext context, String basePath) {
    super(context, HoodieWriteConfig.newBuilder().withPath(basePath).build());
  }

  /**
   * Validate all compaction operations in a compaction plan. Verifies the file-slices are consistent with corresponding
   * compaction operations.
   *
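   * <p>Illustrative call, assuming a {@code HoodieTableMetaClient} named {@code metaClient} is at hand;
   * the instant time and parallelism are placeholder values:
   * <pre>{@code
   * List<ValidationOpResult> results = validateCompactionPlan(metaClient, "20230101010101", 2);
   * results.forEach(r -> LOG.info("Validation result: " + r));
   * }</pre>
   *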
   * @param metaClient Hoodie Table Meta Client
   * @param compactionInstant Compaction Instant
   */
  public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant,
      int parallelism) throws IOException {
    HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
    HoodieTableFileSystemView fsView =
        new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());

    if (plan.getOperations() != null) {
      List<CompactionOperation> ops = plan.getOperations().stream()
          .map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
      context.setJobStatus(this.getClass().getSimpleName(), "Validate compaction operations: " + config.getTableName());
      return context.map(ops, op -> {
        try {
          return validateCompactionOperation(metaClient, compactionInstant, op, Option.of(fsView));
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      }, parallelism);
    }
    return new ArrayList<>();
  }

  /**
   * Un-schedules a compaction plan by removing all scheduled compaction operations.
   *
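   * <p>Illustrative calls; the instant time and parallelism are placeholder values, and running with
   * {@code dryRun = true} first leaves the timeline untouched:
   * <pre>{@code
   * unscheduleCompactionPlan("20230101010101", false, 2, true);  // dry run, no changes
   * unscheduleCompactionPlan("20230101010101", false, 2, false); // deletes the compaction requested instant
   * }</pre>
   *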
   * @param compactionInstant Compaction Instant
   * @param skipValidation Skip validation step
   * @param parallelism Parallelism
   * @param dryRun Dry Run
   */
  public List<RenameOpResult> unscheduleCompactionPlan(String compactionInstant, boolean skipValidation,
      int parallelism, boolean dryRun) throws Exception {
    HoodieTableMetaClient metaClient = createMetaClient(false);
    InstantFileNameGenerator instantFileNameGenerator = metaClient.getInstantFileNameGenerator();
    // Only if all operations are successfully executed
    if (!dryRun) {
      // Overwrite compaction request with empty compaction operations
      HoodieInstant inflight = metaClient.createNewInstant(State.INFLIGHT, COMPACTION_ACTION, compactionInstant);
      StoragePath inflightPath = new StoragePath(metaClient.getTimelinePath(), metaClient.getInstantFileNameGenerator().getFileName(inflight));
      if (metaClient.getStorage().exists(inflightPath)) {
        // We need to rollback data-files because of this inflight compaction before unscheduling
        throw new IllegalStateException("Please rollback the inflight compaction before unscheduling");
      }
      // Leave the trace in aux folder but delete from metapath.
      // TODO: Add a rollback instant but for compaction
      HoodieInstant instant = metaClient.createNewInstant(State.REQUESTED, COMPACTION_ACTION, compactionInstant);
      boolean deleted = metaClient.getStorage().deleteFile(
          new StoragePath(metaClient.getTimelinePath(), instantFileNameGenerator.getFileName(instant)));
      ValidationUtils.checkArgument(deleted, "Unable to delete compaction instant.");
    }
    return new ArrayList<>();
  }

  /**
   * Removes a fileId from pending compaction. Removes the associated compaction operation and renames delta-files that
   * were generated for that file-id after the compaction operation was scheduled.
   *
   * This operation MUST be executed with compactions and writer turned OFF.
   *
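   * <p>Illustrative call; the partition path and file id are placeholder values:
   * <pre>{@code
   * HoodieFileGroupId fgId = new HoodieFileGroupId("2023/01/01", "a1b2c3d4-file-id");
   * unscheduleCompactionFileId(fgId, false, false);
   * }</pre>
   *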
   * @param fgId FileGroupId to be unscheduled
   * @param skipValidation Skip validation
   * @param dryRun Dry Run Mode
   */
  public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId, boolean skipValidation, boolean dryRun)
      throws Exception {
    HoodieTableMetaClient metaClient = createMetaClient(false);
    InstantFileNameGenerator instantFileNameGenerator = metaClient.getInstantFileNameGenerator();

    if (!dryRun) {
      // Ready to remove this file-Id from compaction request
      Pair<String, HoodieCompactionOperation> compactionOperationWithInstant =
          CompactionUtils.getAllPendingCompactionOperations(metaClient).get(fgId);
      HoodieCompactionPlan plan =
          CompactionUtils.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
      List<HoodieCompactionOperation> newOps = plan.getOperations().stream().filter(op ->
              (!op.getFileId().equals(fgId.getFileId())) && (!op.getPartitionPath().equals(fgId.getPartitionPath())))
          .collect(Collectors.toList());
      if (newOps.size() == plan.getOperations().size()) {
        return new ArrayList<>();
      }
      HoodieCompactionPlan newPlan =
          HoodieCompactionPlan.newBuilder().setOperations(newOps).setExtraMetadata(plan.getExtraMetadata()).build();
      HoodieInstant inflight =
          metaClient.createNewInstant(State.INFLIGHT, COMPACTION_ACTION, compactionOperationWithInstant.getLeft());
      StoragePath inflightPath = new StoragePath(metaClient.getTimelinePath(), instantFileNameGenerator.getFileName(inflight));
      if (metaClient.getStorage().exists(inflightPath)) {
        // revert if in inflight state
        metaClient.getActiveTimeline().revertInstantFromInflightToRequested(inflight);
      }
      // Overwrite compaction plan with updated info
      metaClient.getActiveTimeline().saveToCompactionRequested(
          metaClient.createNewInstant(State.REQUESTED, COMPACTION_ACTION, compactionOperationWithInstant.getLeft()),
          TimelineMetadataUtils.serializeCompactionPlan(newPlan), true);
    }
    return new ArrayList<>();
  }

  /**
   * Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata. Use when
   * compaction unschedule fails partially.
   *
   * This operation MUST be executed with compactions and writer turned OFF.
   * 
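   * <p>Illustrative call; the instant time and parallelism are placeholder values:
   * <pre>{@code
   * repairCompaction("20230101010101", 2, false);
   * }</pre>
   *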
   * @param compactionInstant Compaction Instant to be repaired
   * @param dryRun Dry Run Mode
   */
  public List<RenameOpResult> repairCompaction(String compactionInstant, int parallelism, boolean dryRun)
      throws Exception {
    HoodieTableMetaClient metaClient = createMetaClient(false);
    validateCompactionPlan(metaClient, compactionInstant, parallelism);

    return new ArrayList<>();
  }

  /**
   * Constructs a Compaction Plan from a compaction instant.
   */
  private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant)
      throws IOException {
    return TimelineMetadataUtils.deserializeCompactionPlan(
            metaClient.getActiveTimeline().readCompactionPlanAsBytes(
                    metaClient.getInstantGenerator().getCompactionRequestedInstant(compactionInstant)).get());
  }

  /**
   * Renames log files. This is done when un-scheduling a pending compaction operation. NOTE: Can only be used safely when
   * no writer (ingestion/compaction) is running.
   *
   * @param metaClient Hoodie Table Meta-Client
   * @param oldLogFile Old Log File
   * @param newLogFile New Log File
   */
  protected static void renameLogFile(HoodieTableMetaClient metaClient, HoodieLogFile oldLogFile,
      HoodieLogFile newLogFile) throws IOException {
    List<StoragePathInfo> pathInfoList =
        metaClient.getStorage().listDirectEntries(oldLogFile.getPath());
    ValidationUtils.checkArgument(pathInfoList.size() == 1, "Only one status must be present");
    ValidationUtils.checkArgument(pathInfoList.get(0).isFile(), "Source File must exist");
    ValidationUtils.checkArgument(
        oldLogFile.getPath().getParent().equals(newLogFile.getPath().getParent()),
        "Log file must only be moved within the parent directory");
    metaClient.getStorage().rename(oldLogFile.getPath(), newLogFile.getPath());
  }

  /**
   * Check if a compaction operation is valid.
   *
   * @param metaClient Hoodie Table Meta client
   * @param compactionInstant Compaction Instant
   * @param operation Compaction Operation
   * @param fsViewOpt File System View
   */
  private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant,
      CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt) throws IOException {
    HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
        : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
    Option<HoodieInstant> lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant();
    try {
      if (lastInstant.isPresent()) {
        Option<FileSlice> fileSliceOptional =
            Option.fromJavaOptional(fileSystemView.getLatestUnCompactedFileSlices(operation.getPartitionPath())
                .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst());
        if (fileSliceOptional.isPresent()) {
          FileSlice fs = fileSliceOptional.get();
          Option<HoodieBaseFile> df = fs.getBaseFile();
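          // Check 1: the base file named in the compaction operation must match the base file of the latest un-compacted file-slice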
          if (operation.getDataFileName().isPresent()) {
            String expPath = metaClient.getStorage()
                .getPathInfo(new StoragePath(
                    FSUtils.constructAbsolutePath(metaClient.getBasePath(), operation.getPartitionPath()),
                    operation.getDataFileName().get()))
                .getPath().toString();
            ValidationUtils.checkArgument(df.isPresent(),
                "Data File must be present. File Slice was : " + fs + ", operation :" + operation);
            ValidationUtils.checkArgument(df.get().getPath().equals(expPath),
                "Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath());
          }
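          // Check 2: every delta (log) file named in the compaction operation must still be present in the file-slice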
          Set<HoodieLogFile> logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet());
          Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFileNames().stream().map(dp -> {
            try {
              List<StoragePathInfo> pathInfoList = metaClient.getStorage()
                  .listDirectEntries(new StoragePath(
                      FSUtils.constructAbsolutePath(metaClient.getBasePath(), operation.getPartitionPath()), dp));
              ValidationUtils.checkArgument(pathInfoList.size() == 1, "Expect only 1 file-status");
              return new HoodieLogFile(pathInfoList.get(0));
            } catch (FileNotFoundException fe) {
              throw new CompactionValidationException(fe.getMessage());
            } catch (IOException ioe) {
              throw new HoodieIOException(ioe.getMessage(), ioe);
            }
          }).collect(Collectors.toSet());
          Set<HoodieLogFile> missing = logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
              .collect(Collectors.toSet());
          ValidationUtils.checkArgument(missing.isEmpty(),
              "All log files specified in compaction operation is not present. Missing :" + missing + ", Exp :"
                  + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
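          // Check 3: log files in the file-slice that are not in the plan must have been written at or after the compaction request instant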
          Set<HoodieLogFile> diff = logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
              .collect(Collectors.toSet());
          ValidationUtils.checkArgument(diff.stream().allMatch(lf -> compareTimestamps(lf.getDeltaCommitTime(), GREATER_THAN_OR_EQUALS, compactionInstant)),
              "There are some log-files which are neither specified in compaction plan "
                  + "nor present after compaction request instant. Some of these :" + diff);
        } else {
          throw new CompactionValidationException(
              "Unable to find file-slice for file-id (" + operation.getFileId() + " Compaction operation is invalid.");
        }
      } else {
        throw new CompactionValidationException(
            "Unable to find any committed instant. Compaction Operation may be pointing to stale file-slices");
      }
    } catch (CompactionValidationException | IllegalArgumentException e) {
      return new ValidationOpResult(operation, false, Option.of(e));
    }
    return new ValidationOpResult(operation, true, Option.empty());
  }

  /**
   * Execute Renaming operation.
   *
   * @param metaClient HoodieTable MetaClient
   * @param renameActions List of rename operations
   */
  private List<RenameOpResult> runRenamingOps(HoodieTableMetaClient metaClient,
      List<Pair<HoodieLogFile, HoodieLogFile>> renameActions, int parallelism, boolean dryRun) {
    if (renameActions.isEmpty()) {
      LOG.info("No renaming of log-files needed. Proceeding to removing file-id from compaction-plan");
      return new ArrayList<>();
    } else {
      LOG.info("The following compaction renaming operations needs to be performed to un-schedule");
      if (!dryRun) {
        context.setJobStatus(this.getClass().getSimpleName(), "Execute unschedule operations: " + config.getTableName());
        return context.map(renameActions, lfPair -> {
          try {
            LOG.info("RENAME " + lfPair.getLeft().getPath() + " => " + lfPair.getRight().getPath());
            renameLogFile(metaClient, lfPair.getLeft(), lfPair.getRight());
            return new RenameOpResult(lfPair, true, Option.empty());
          } catch (IOException e) {
            LOG.error("Error renaming log file", e);
            LOG.error("\n\n\n***NOTE Compaction is in inconsistent state. Try running \"compaction repair "
                + lfPair.getLeft().getDeltaCommitTime() + "\" to recover from failure ***\n\n\n");
            return new RenameOpResult(lfPair, false, Option.of(e));
          }
        }, parallelism);
      } else {
        LOG.info("Dry-Run Mode activated for rename operations");
        return renameActions.parallelStream().map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
            .collect(Collectors.toList());
      }
    }
  }

  /**
   * Holds Operation result for Renaming.
   */
  public static class RenameOpResult extends OperationResult<RenameInfo> {

    public RenameOpResult() {
    }

    public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success, Option<Exception> exception) {
      super(
          new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
          success, exception);
    }

    public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
        Option<Exception> exception) {
      super(
          new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
          executed, success, exception);
    }
  }

  /**
   * Holds Operation result for Validation.
   */
  public static class ValidationOpResult extends OperationResult<CompactionOperation> {
    public ValidationOpResult(CompactionOperation operation, boolean success, Option<Exception> exception) {
      super(operation, success, exception);
    }
  }

  public static class RenameInfo implements Serializable {

    public String fileId;
    public String srcPath;
    public String destPath;

    public RenameInfo(String fileId, String srcPath, String destPath) {
      this.fileId = fileId;
      this.srcPath = srcPath;
      this.destPath = destPath;
    }
  }

  public static class CompactionValidationException extends RuntimeException {

    public CompactionValidationException(String msg) {
      super(msg);
    }
  }
}