/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sink.clustering;
import org.apache.hudi.avro.model.HoodieClusteringGroup;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.TableServiceType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.ClusteringUtils;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieClusteringException;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.metrics.FlinkClusteringMetrics;
import org.apache.hudi.sink.CleanFunction;
import org.apache.hudi.table.HoodieFlinkTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.util.ClusteringUtil;
import org.apache.hudi.util.FlinkWriteClients;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.MetricGroup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Function to check and commit the clustering action.
*
* <p>Each time after receiving a clustering commit event {@link ClusteringCommitEvent},
* it loads and checks the clustering plan {@link org.apache.hudi.avro.model.HoodieClusteringPlan};
* if all the clustering operations {@link org.apache.hudi.common.model.ClusteringOperation}
* of the plan are finished, it tries to commit the clustering action.
*
* <p>It also inherits the {@link CleanFunction} cleaning ability. This is needed because
* the SQL API does not allow multiple sinks in one table sink provider.
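*
* <p>A minimal wiring sketch (illustrative only; {@code clusteringStream} is assumed to be a
* {@code DataStream<ClusteringCommitEvent>} produced by the upstream clustering tasks and
* {@code conf} the Flink {@code Configuration} of the pipeline):
* <pre>{@code
*   clusteringStream
*       .addSink(new ClusteringCommitSink(conf))
*       .name("clustering_commit")
*       .setParallelism(1); // single parallelism so all events of an instant reach the same task
* }</pre>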
*/
public class ClusteringCommitSink extends CleanFunction<ClusteringCommitEvent> {
private static final Logger LOG = LoggerFactory.getLogger(ClusteringCommitSink.class);
/**
* Config options.
*/
private final Configuration conf;
private transient HoodieFlinkTable<?> table;
/**
* Buffer to collect the event from each clustering task {@code ClusteringFunction}.
*
* <p>Stores the mapping of instant_time -> file_ids -> event. Use a map to collect the
* events because the rolling back of intermediate clustering tasks generates corrupt
* events.
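*
* <p>Illustrative shape (instant time and file IDs are hypothetical):
* <pre>
*   "20240101103000000" -> { "fg-1,fg-2" -> event of group(fg-1, fg-2),
*                            "fg-3"      -> event of group(fg-3) }
* </pre>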
*/
private transient Map<String, Map<String, ClusteringCommitEvent>> commitBuffer;
/**
* Cache to store clustering plan for each instant.
* <p>Stores the mapping of instant_time -> clusteringPlan.
*/
private transient Map<String, HoodieClusteringPlan> clusteringPlanCache;
private transient FlinkClusteringMetrics clusteringMetrics;
public ClusteringCommitSink(Configuration conf) {
super(conf);
this.conf = conf;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
if (writeClient == null) {
this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext());
}
this.commitBuffer = new HashMap<>();
this.clusteringPlanCache = new HashMap<>();
this.table = writeClient.getHoodieTable();
registerMetrics();
}
@Override
public void invoke(ClusteringCommitEvent event, Context context) throws Exception {
final String instant = event.getInstant();
if (event.isFailed()
|| (event.getWriteStatuses() != null
&& event.getWriteStatuses().stream().anyMatch(writeStatus -> writeStatus.getTotalErrorRecords() > 0))) {
LOG.warn("Receive abnormal ClusteringCommitEvent of instant {}, task ID is {},"
+ " is failed: {}, error record count: {}",
instant, event.getTaskID(), event.isFailed(), getNumErrorRecords(event));
}
commitBuffer.computeIfAbsent(instant, k -> new HashMap<>())
.put(event.getFileIds(), event);
commitIfNecessary(instant, commitBuffer.get(instant).values());
}
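/**
* Returns the total number of error records across the event's write statuses,
* or -1 if the write statuses are not available.
*/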
private long getNumErrorRecords(ClusteringCommitEvent event) {
if (event.getWriteStatuses() == null) {
return -1L;
}
return event.getWriteStatuses().stream()
.map(WriteStatus::getTotalErrorRecords).reduce(Long::sum).orElse(0L);
}
/**
* Condition to commit: the commit buffer has the same size as the clustering plan's input groups
* and all the clustering commit events {@link ClusteringCommitEvent} share the same clustering instant time.
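*
* <p>Illustration with hypothetical file IDs, for a plan with 3 input groups:
* <pre>
*   { "fg-1" -> e1, "fg-2" -> e2 }               - only 2 of 3 events received, keep buffering
*   { "fg-1" -> e1, "fg-2" -> e2, "fg-3" -> e3 } - all 3 received, attempt the commit
* </pre>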
*
* @param instant Clustering commit instant time
* @param events Commit events ever received for the instant
*/
private void commitIfNecessary(String instant, Collection<ClusteringCommitEvent> events) {
HoodieClusteringPlan clusteringPlan = clusteringPlanCache.computeIfAbsent(instant, k -> {
try {
HoodieTableMetaClient metaClient = this.writeClient.getHoodieTable().getMetaClient();
return ClusteringUtils.getInflightClusteringInstant(instant, metaClient.getActiveTimeline(), table.getInstantGenerator())
.flatMap(pendingInstant -> ClusteringUtils.getClusteringPlan(
metaClient, pendingInstant))
.map(Pair::getRight)
.orElse(null);
} catch (Exception e) {
throw new HoodieException(e);
}
});
if (clusteringPlan == null) {
return;
}
boolean isReady = clusteringPlan.getInputGroups().size() == events.size();
if (!isReady) {
return;
}
if (events.stream().anyMatch(ClusteringCommitEvent::isFailed)) {
try {
// handle failure case
ClusteringUtil.rollbackClustering(table, writeClient, instant);
} finally {
// remove commitBuffer to avoid obsolete metadata commit
reset(instant);
}
return;
}
try {
doCommit(instant, clusteringPlan, events);
} catch (Throwable throwable) {
// make it fail-safe
LOG.error("Error while committing clustering instant: " + instant, throwable);
} finally {
// reset the status
reset(instant);
}
}
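/**
* Builds the replace commit metadata from the collected write statuses and completes the
* clustering table service on the timeline. If error records were produced and
* {@code FlinkOptions.IGNORE_FAILED} is disabled, the clustering is rolled back instead.
* Triggers an inline clean afterwards when async cleaning is disabled and no async clean is in progress.
*/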
private void doCommit(String instant, HoodieClusteringPlan clusteringPlan, Collection<ClusteringCommitEvent> events) {
List<WriteStatus> statuses = events.stream()
.map(ClusteringCommitEvent::getWriteStatuses)
.flatMap(Collection::stream)
.collect(Collectors.toList());
long numErrorRecords = statuses.stream().map(WriteStatus::getTotalErrorRecords).reduce(Long::sum).orElse(0L);
if (numErrorRecords > 0 && !this.conf.getBoolean(FlinkOptions.IGNORE_FAILED)) {
// handle failure case
LOG.error("Got {} error records during clustering of instant {},\n"
+ "option '{}' is configured as false,"
+ "rolls back the clustering", numErrorRecords, instant, FlinkOptions.IGNORE_FAILED.key());
ClusteringUtil.rollbackClustering(table, writeClient, instant);
return;
}
HoodieWriteMetadata<List<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
writeMetadata.setWriteStatuses(statuses);
writeMetadata.setWriteStats(statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()));
writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata));
validateWriteResult(clusteringPlan, instant, writeMetadata);
if (!writeMetadata.getCommitMetadata().isPresent()) {
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(
writeMetadata.getWriteStats().get(),
writeMetadata.getPartitionToReplaceFileIds(),
Option.empty(),
WriteOperationType.CLUSTER,
this.writeClient.getConfig().getSchema(),
HoodieTimeline.REPLACE_COMMIT_ACTION);
writeMetadata.setCommitMetadata(Option.of(commitMetadata));
}
// commit the clustering
this.table.getMetaClient().reloadActiveTimeline();
this.writeClient.completeTableService(TableServiceType.CLUSTER, writeMetadata.getCommitMetadata().get(), table, instant);
clusteringMetrics.updateCommitMetrics(instant, writeMetadata.getCommitMetadata().get());
// whether to clean up the input base parquet files used for clustering
if (!conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED) && !isCleaning) {
LOG.info("Running inline clean");
this.writeClient.clean();
}
}
private void reset(String instant) {
this.commitBuffer.remove(instant);
this.clusteringPlanCache.remove(instant);
}
/**
* Validate actions taken by clustering. In the first implementation, we validate that at least one new file is written.
* But we can extend this to add more validations, e.g. number of records read equals number of records written, etc.
* We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
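*
* <p>A sketch of the suggested "records read equals records written" style check, not part of the
* current implementation (assumes the written total is derived from
* {@link org.apache.hudi.common.model.HoodieWriteStat}):
* <pre>{@code
*   long totalWritten = writeMetadata.getWriteStats().get().stream()
*       .mapToLong(HoodieWriteStat::getNumWrites)
*       .sum();
*   // compare totalWritten against the record count of the plan's input file groups
* }</pre>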
*/
private static void validateWriteResult(HoodieClusteringPlan clusteringPlan, String instantTime, HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
if (writeMetadata.getWriteStatuses().isEmpty()) {
throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
+ " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
+ clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
+ " write statuses");
}
}
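/**
* Computes the partition path -> replaced file IDs mapping: every file group referenced by the
* clustering plan, excluding any file group that also appears among the newly written file groups.
*/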
private static Map<String, List<String>> getPartitionToReplacedFileIds(
HoodieClusteringPlan clusteringPlan,
HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
.map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
.filter(fg -> !newFilesWritten.contains(fg))
.collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList())));
}
private void registerMetrics() {
MetricGroup metrics = getRuntimeContext().getMetricGroup();
clusteringMetrics = new FlinkClusteringMetrics(metrics);
clusteringMetrics.registerMetrics();
}
}