All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.table.action.index.AbstractIndexingCatchupTask Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.table.action.index;

import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient;
import org.apache.hudi.client.transaction.TransactionManager;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.hudi.table.HoodieTable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import static org.apache.hudi.common.table.timeline.HoodieInstant.State.COMPLETED;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.RESTORE_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION;
import static org.apache.hudi.table.action.index.RunIndexActionExecutor.TIMELINE_RELOAD_INTERVAL_MILLIS;

/**
 * Indexing check runs for instants that completed after the base instant (in the index plan).
 * It will check if these later instants have logged updates to metadata table or not.
 * If not, then it will do the update. If a later instant is inflight, it will wait until it is completed or the task times out.
 */
public abstract class AbstractIndexingCatchupTask implements IndexingCatchupTask {
  private static final Logger LOG = LoggerFactory.getLogger(AbstractIndexingCatchupTask.class);

  protected final HoodieTableMetadataWriter metadataWriter;
  protected final List instantsToIndex;
  protected final Set metadataCompletedInstants;
  protected final HoodieTableMetaClient metaClient;
  protected final HoodieTableMetaClient metadataMetaClient;
  protected final TransactionManager transactionManager;
  protected final HoodieEngineContext engineContext;
  protected final HoodieTable table;
  protected final HoodieHeartbeatClient heartbeatClient;
  protected String currentCaughtupInstant;

  public AbstractIndexingCatchupTask(HoodieTableMetadataWriter metadataWriter,
                                     List instantsToIndex,
                                     Set metadataCompletedInstants,
                                     HoodieTableMetaClient metaClient,
                                     HoodieTableMetaClient metadataMetaClient,
                                     TransactionManager transactionManager,
                                     String currentCaughtupInstant,
                                     HoodieEngineContext engineContext,
                                     HoodieTable table,
                                     HoodieHeartbeatClient heartbeatClient) {
    this.metadataWriter = metadataWriter;
    this.instantsToIndex = instantsToIndex;
    this.metadataCompletedInstants = metadataCompletedInstants;
    this.metaClient = metaClient;
    this.metadataMetaClient = metadataMetaClient;
    this.transactionManager = transactionManager;
    this.currentCaughtupInstant = currentCaughtupInstant;
    this.engineContext = engineContext;
    this.table = table;
    this.heartbeatClient = heartbeatClient;
  }

  @Override
  public void run() {
    for (HoodieInstant instant : instantsToIndex) {
      // Already caught up to this instant, or no heartbeat, or heartbeat expired for this instant
      if (awaitInstantCaughtUp(instant)) {
        continue;
      }
      // if instant completed, ensure that there was metadata commit, else update metadata for this completed instant
      if (COMPLETED.equals(instant.getState())) {
        String instantTime = instant.requestedTime();
        Option metadataInstant = metadataMetaClient.reloadActiveTimeline()
            .filterCompletedInstants().filter(i -> i.requestedTime().equals(instantTime)).firstInstant();
        if (metadataInstant.isPresent()) {
          currentCaughtupInstant = instantTime;
          continue;
        }
        try {
          // we need take a lock here as inflight writer could also try to update the timeline
          transactionManager.beginTransaction(Option.of(instant), Option.empty());
          LOG.info("Updating metadata table for instant: " + instant);
          switch (instant.getAction()) {
            case HoodieTimeline.COMMIT_ACTION:
            case HoodieTimeline.DELTA_COMMIT_ACTION:
            case HoodieTimeline.REPLACE_COMMIT_ACTION:
              updateIndexForWriteAction(instant);
              break;
            case CLEAN_ACTION:
              HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, instant);
              metadataWriter.update(cleanMetadata, instant.requestedTime());
              break;
            case RESTORE_ACTION:
              HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata(
                  metaClient.getActiveTimeline().getInstantDetails(instant).get());
              metadataWriter.update(restoreMetadata, instant.requestedTime());
              break;
            case ROLLBACK_ACTION:
              HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata(
                  metaClient.getActiveTimeline().getInstantDetails(instant).get());
              metadataWriter.update(rollbackMetadata, instant.requestedTime());
              break;
            default:
              throw new IllegalStateException("Unexpected value: " + instant.getAction());
          }
        } catch (IOException e) {
          throw new HoodieIndexException(String.format("Could not update metadata partition for instant: %s", instant), e);
        } finally {
          transactionManager.endTransaction(Option.of(instant));
        }
      }
    }
  }

  /**
   * Updates metadata table for the instant. This is only called for actions that do actual writes,
   * i.e. for commit/deltacommit/compaction/replacecommit and not for clean/restore/rollback actions.
   *
   * @param instant HoodieInstant for which to update metadata table
   */
  public abstract void updateIndexForWriteAction(HoodieInstant instant) throws IOException;

  /**
   * For the given instant, this method checks if it is already caught up or not.
   * If not, it waits until the instant is completed.
   * 

* 1. single writer. * a. pending ingestion commit: If no heartbeat, then we are good to ignore. * b. pending table service commit: There won't be any heartbeat. If no heartbeat, then we are good to ignore (strictly assuming single writer and inline table service). *

* 2. streamer + async table service. * a. pending ingestion commit: If no heartbeat, then we are good to ignore. * b. pending table service commit: There won't be any heartbeat. If no heartbeat, then we are good to ignore because we assume that user stops the main writer to create the index. *

* 3. Multi-writer scenarios: * a. Spark datasource ingestion (OR streamer all inline) going on. User is trying to build index via spark-sql concurrently (w/o stopping the main writer) * b. deltastreamer + async table services ongoing. User concurrently builds the index via spark-sql. * c. multi-writer spark-ds writers. User is trying to build index via spark-sql concurrently (w/o stopping the all other writer) * For new indexes added in 1.0.0, these flows are experimental. TODO: HUDI-8607. * * @param instant HoodieInstant to check * @return True if instant is already caught up, or no heartbeat, or expired heartbeat. If heartbeat exists and not expired, then return false. */ boolean awaitInstantCaughtUp(HoodieInstant instant) { if (!metadataCompletedInstants.isEmpty() && metadataCompletedInstants.contains(instant.requestedTime())) { currentCaughtupInstant = instant.requestedTime(); return true; } if (!instant.isCompleted()) { // check heartbeat try { // if no heartbeat, then ignore this instant if (!HoodieHeartbeatClient.heartbeatExists(metaClient.getStorage(), metaClient.getBasePath().toString(), instant.requestedTime())) { LOG.info("Ignoring instant " + instant + " as no heartbeat found"); return true; } // if heartbeat exists, but expired, then ignore this instant if (table.getConfig().getFailedWritesCleanPolicy().isLazy() && heartbeatClient.isHeartbeatExpired(instant.requestedTime())) { LOG.info("Ignoring instant " + instant + " as heartbeat expired"); return true; } } catch (IOException e) { throw new HoodieIOException("Unable to check if heartbeat expired for instant " + instant, e); } try { LOG.warn("instant not completed, reloading timeline " + instant); reloadTimelineWithWait(instant); } catch (InterruptedException e) { throw new HoodieIndexException(String.format("Thread interrupted while running indexing check for instant: %s", instant), e); } } return false; } private void reloadTimelineWithWait(HoodieInstant instant) throws InterruptedException { String 
instantTime = instant.requestedTime(); Option currentInstant; do { currentInstant = metaClient.reloadActiveTimeline() .filterCompletedInstants().filter(i -> i.requestedTime().equals(instantTime)).firstInstant(); if (!currentInstant.isPresent() || !currentInstant.get().isCompleted()) { Thread.sleep(TIMELINE_RELOAD_INTERVAL_MILLIS); } } while (!currentInstant.isPresent() || !currentInstant.get().isCompleted()); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy