// org.apache.hudi.utilities.HoodieClusteringJob Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieCleanConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.table.HoodieSparkTable;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.spark.api.java.JavaSparkContext;
import org.jetbrains.annotations.TestOnly;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;

import static org.apache.hudi.utilities.UtilHelpers.EXECUTE;
import static org.apache.hudi.utilities.UtilHelpers.PURGE_PENDING_INSTANT;
import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE;
import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE;
public class HoodieClusteringJob {
// Class-wide SLF4J logger.
private static final Logger LOG = LoggerFactory.getLogger(HoodieClusteringJob.class);
// Job options; runningMode and clusteringInstantTime may be filled in while the job runs.
private final Config cfg;
// Hoodie client properties; the constructor adds the async-clean override (and, when applicable, lock options).
private final TypedProperties props;
// Spark context used to create the write client for every operation.
private final JavaSparkContext jsc;
// Table meta client; not final because each operation reloads it before use.
private HoodieTableMetaClient metaClient;
/**
 * Creates a clustering job, deriving the client properties and the table meta client from {@code cfg}.
 *
 * <p>The properties are built by {@code UtilHelpers.buildProperties} from the Hadoop configuration of
 * {@code jsc}, {@code cfg.propsFilePath} and {@code cfg.configs}; the meta client is created by
 * {@code UtilHelpers.createMetaClient} for {@code cfg.basePath}. Both are then handed to the
 * four-argument constructor.
 *
 * @param jsc Spark context the job runs on
 * @param cfg job configuration
 */
public HoodieClusteringJob(JavaSparkContext jsc, Config cfg) {
this(jsc, cfg, UtilHelpers.buildProperties(jsc.hadoopConfiguration(), cfg.propsFilePath, cfg.configs),
UtilHelpers.createMetaClient(jsc, cfg.basePath, true));
}
public HoodieClusteringJob(JavaSparkContext jsc, Config cfg, TypedProperties props, HoodieTableMetaClient metaClient) {
this.cfg = cfg;
this.jsc = jsc;
this.props = props;
this.metaClient = metaClient;
// Disable async cleaning, will trigger synchronous cleaning manually.
this.props.put(HoodieCleanConfig.ASYNC_CLEAN.key(), false);
if (this.metaClient.getTableConfig().isMetadataTableAvailable()) {
// add default lock config options if MDT is enabled.
UtilHelpers.addLockOptions(cfg.basePath, this.metaClient.getBasePath().toUri().getScheme(), this.props);
}
}
/**
 * Command-line options of {@link HoodieClusteringJob}, bound by JCommander.
 *
 * <p>Fields are public and mutable because the job itself fills in {@code runningMode} and
 * {@code clusteringInstantTime} when they are not supplied.
 */
public static class Config implements Serializable {
  @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true)
  public String basePath = null;

  @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true)
  public String tableName = null;

  @Parameter(names = {"--instant-time", "-it"}, description = "Clustering Instant time, only used when set --mode execute. "
      + "If the instant time is not provided with --mode execute, "
      + "the earliest scheduled clustering instant time is used by default. "
      + "When set \"--mode scheduleAndExecute\" this instant-time will be ignored.")
  public String clusteringInstantTime = null;

  @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert")
  public int parallelism = 1;

  @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master")
  public String sparkMaster = null;

  @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false)
  public String sparkMemory = null;

  @Parameter(names = {"--retry", "-rt"}, description = "number of retries")
  public int retry = 0;

  // No short alias here: "-sc" is already claimed by --schedule below, and registering the same
  // option name on two fields makes JCommander reject the whole configuration.
  @Parameter(names = {"--skip-clean"}, description = "do not trigger clean after clustering", required = false)
  public Boolean skipClean = true;

  // Legacy switch, only consulted when --mode is not given.
  @Parameter(names = {"--schedule", "-sc"}, description = "Schedule clustering @deprecated soon please use \"--mode schedule\" instead")
  public Boolean runSchedule = false;

  @Parameter(names = {"--retry-last-failed-clustering-job", "-rc"}, description = "Take effect when using --mode/-m scheduleAndExecute. Set true means "
      + "check, rollback and execute last failed clustering plan instead of planing a new clustering job directly.")
  public Boolean retryLastFailedClusteringJob = false;

  @Parameter(names = {"--mode", "-m"}, description = "Set job mode: Set \"schedule\" means make a cluster plan; "
      + "Set \"execute\" means execute a cluster plan at given instant which means --instant-time is needed here; "
      + "Set \"scheduleAndExecute\" means make a cluster plan first and execute that plan immediately")
  public String runningMode = null;

  @Parameter(names = {"--help", "-h"}, help = true)
  public Boolean help = false;

  @Parameter(names = {"--job-max-processing-time-ms", "-jt"}, description = "Take effect when using --mode/-m scheduleAndExecute and --retry-last-failed-clustering-job/-rc true. "
      + "If maxProcessingTimeMs passed but clustering job is still unfinished, hoodie would consider this job as failed and relaunch.")
  public long maxProcessingTimeMs = 0;

  @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
      + "hoodie client for clustering")
  public String propsFilePath = null;

  // Each occurrence of --hoodie-conf contributes one entry; IdentitySplitter is the splitter used for it.
  @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
      + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
      splitter = IdentitySplitter.class)
  public List<String> configs = new ArrayList<>();

  /**
   * Renders the options (all except {@code help}) one per line, for logging.
   */
  @Override
  public String toString() {
    return "HoodieClusteringJobConfig{\n"
        + "   --base-path " + basePath + ", \n"
        + "   --table-name " + tableName + ", \n"
        + "   --instant-time " + clusteringInstantTime + ", \n"
        + "   --parallelism " + parallelism + ", \n"
        + "   --spark-master " + sparkMaster + ", \n"
        + "   --spark-memory " + sparkMemory + ", \n"
        + "   --retry " + retry + ", \n"
        + "   --skipClean " + skipClean + ", \n"
        + "   --schedule " + runSchedule + ", \n"
        + "   --retry-last-failed-clustering-job " + retryLastFailedClusteringJob + ", \n"
        + "   --mode " + runningMode + ", \n"
        + "   --job-max-processing-time-ms " + maxProcessingTimeMs + ", \n"
        + "   --props " + propsFilePath + ", \n"
        + "   --hoodie-conf " + configs + ", \n"
        + "\n}";
  }
}
/**
 * Command-line entry point: parses the options, runs the clustering job and stops the Spark context.
 *
 * @param args command-line arguments bound onto {@link Config}
 * @throws HoodieException when help is requested or no arguments are given (after printing usage),
 *                         or when the job finishes with a non-zero result
 */
public static void main(String[] args) {
  final Config cfg = new Config();
  // NOTE(review): Config declares required options, so an empty argument list may already be
  // rejected while parsing here, before the usage branch below is reached — confirm.
  JCommander cmd = new JCommander(cfg, null, args);
  if (cfg.help || args.length == 0) {
    cmd.usage();
    throw new HoodieException("Clustering failed for basePath: " + cfg.basePath);
  }

  final JavaSparkContext jsc = UtilHelpers.buildSparkContext("clustering-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory);
  try {
    int result = new HoodieClusteringJob(jsc, cfg).cluster(cfg.retry);
    String resultMsg = String.format("Clustering with basePath: %s, tableName: %s, runningMode: %s",
        cfg.basePath, cfg.tableName, cfg.runningMode);
    if (result != 0) {
      throw new HoodieException(resultMsg + " failed");
    }
    LOG.info(resultMsg + " success");
  } finally {
    // Stop the Spark context on every exit path, including a failed or throwing job.
    jsc.stop();
  }
}
/**
 * Makes sure {@code cfg.runningMode} is set: when it is null or empty it is derived from the
 * legacy {@code --schedule} flag, otherwise it is left untouched.
 *
 * @param cfg job configuration, updated in place
 */
private static void validateRunningMode(Config cfg) {
  if (!StringUtils.isNullOrEmpty(cfg.runningMode)) {
    // --mode has a higher priority than --schedule
    return;
  }
  // If we remove --schedule option in the future we need to change runningMode default value to EXECUTE
  if (cfg.runSchedule) {
    cfg.runningMode = SCHEDULE;
  } else {
    cfg.runningMode = EXECUTE;
  }
}
/**
 * Runs the job in the configured running mode, wrapped in {@code UtilHelpers.retry}.
 *
 * <p>The mode is matched case-insensitively against the {@code UtilHelpers} mode constants.
 * Lower-casing uses {@link Locale#ROOT} so that the match does not depend on the JVM default
 * locale (e.g. the Turkish dotless-i mapping of an upper-case {@code I}).
 *
 * @param retry retry count handed to {@code UtilHelpers.retry}
 * @return the value produced by {@code UtilHelpers.retry}; the mode handlers themselves return
 *         0 on success and -1 when no plan could be scheduled or the mode is unsupported
 */
public int cluster(int retry) {
  // need to do validate in case that users call cluster() directly without setting cfg.runningMode
  validateRunningMode(cfg);
  return UtilHelpers.retry(retry, () -> {
    switch (cfg.runningMode.toLowerCase(Locale.ROOT)) {
      case SCHEDULE: {
        LOG.info("Running Mode: [" + SCHEDULE + "]; Do schedule");
        Option<String> instantTime = doSchedule(jsc);
        if (!instantTime.isPresent()) {
          return -1;
        }
        LOG.info("The schedule instant time is " + instantTime.get());
        return 0;
      }
      case SCHEDULE_AND_EXECUTE: {
        LOG.info("Running Mode: [" + SCHEDULE_AND_EXECUTE + "]");
        return doScheduleAndCluster(jsc);
      }
      case EXECUTE: {
        LOG.info("Running Mode: [" + EXECUTE + "]; Do cluster");
        return doCluster(jsc);
      }
      case PURGE_PENDING_INSTANT: {
        LOG.info("Running Mode: [" + PURGE_PENDING_INSTANT + "];");
        return doPurgePendingInstant(jsc);
      }
      default: {
        LOG.error("Unsupported running mode [" + cfg.runningMode + "], quit the job directly");
        return -1;
      }
    }
  }, "Cluster failed");
}
/**
 * Executes a pending clustering plan and then runs the optional clean step.
 *
 * <p>When {@code cfg.clusteringInstantTime} is null or empty, the first pending clustering
 * instant of the active timeline is used and stored back into {@code cfg}; if there is none,
 * the method returns 0 without doing anything.
 *
 * @param jsc Spark context used to create the write client
 * @return 0 when nothing is pending, otherwise the result of {@code UtilHelpers.handleErrors}
 * @throws Exception propagated from the helpers and the write client
 */
private int doCluster(JavaSparkContext jsc) throws Exception {
  metaClient = HoodieTableMetaClient.reload(metaClient);
  String schemaStr = UtilHelpers.getSchemaFromLatestInstant(metaClient);
  try (SparkRDDWriteClient<HoodieRecordPayload> client =
           UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
    if (StringUtils.isNullOrEmpty(cfg.clusteringInstantTime)) {
      // Instant time is not specified
      // Find the earliest scheduled clustering instant for execution
      Option<HoodieInstant> firstClusteringInstant =
          metaClient.getActiveTimeline().getFirstPendingClusterInstant();
      if (!firstClusteringInstant.isPresent()) {
        LOG.info("There is no scheduled clustering in the table.");
        return 0;
      }
      cfg.clusteringInstantTime = firstClusteringInstant.get().requestedTime();
      LOG.info("Found the earliest scheduled clustering instant which will be executed: "
          + cfg.clusteringInstantTime);
    }
    Option<HoodieCommitMetadata> commitMetadata = client.cluster(cfg.clusteringInstantTime).getCommitMetadata();
    clean(client);
    return UtilHelpers.handleErrors(commitMetadata.get(), cfg.clusteringInstantTime);
  }
}
/**
 * Test hook: schedules clustering using this job's own Spark context.
 *
 * @return the scheduled instant time, or an empty option when none was produced
 * @throws Exception propagated from the scheduling helpers
 */
@TestOnly
public Option<String> doSchedule() throws Exception {
  return this.doSchedule(jsc);
}
/**
 * Reloads the meta client, opens a write client and schedules clustering with it.
 *
 * @param jsc Spark context used to create the write client
 * @return the scheduled instant time, or an empty option when none was produced
 * @throws Exception propagated from the helpers and the write client
 */
private Option<String> doSchedule(JavaSparkContext jsc) throws Exception {
  metaClient = HoodieTableMetaClient.reload(metaClient);
  String schemaStr = UtilHelpers.getSchemaFromLatestInstant(metaClient);
  try (SparkRDDWriteClient<HoodieRecordPayload> client =
           UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
    return doSchedule(client);
  }
}
/**
 * Schedules clustering on the given client.
 *
 * <p>When {@code cfg.clusteringInstantTime} is set, the plan is scheduled at exactly that instant
 * and the instant is returned; otherwise the client chooses the instant. A null or empty instant
 * time counts as "not set", matching how {@code doCluster} interprets the same option.
 *
 * @param client open write client
 * @return the scheduled instant time, or an empty option when the client produced none
 */
private Option<String> doSchedule(SparkRDDWriteClient<?> client) {
  if (!StringUtils.isNullOrEmpty(cfg.clusteringInstantTime)) {
    // NOTE(review): the result of scheduleClusteringAtInstant is ignored here — confirm whether
    // it can signal that no plan was generated.
    client.scheduleClusteringAtInstant(cfg.clusteringInstantTime, Option.empty());
    return Option.of(cfg.clusteringInstantTime);
  }
  return client.scheduleClustering(Option.empty());
}
/**
 * Schedules a clustering plan (or picks up a timed-out pending one) and executes it immediately.
 *
 * <p>With {@code cfg.retryLastFailedClusteringJob} set, the last pending clustering instant is
 * reused when it is older than {@code cfg.maxProcessingTimeMs}; otherwise, or when there is no
 * such instant, a new plan is scheduled.
 *
 * @param jsc Spark context used to create the write client
 * @return -1 when no plan could be generated, otherwise the result of {@code UtilHelpers.handleErrors}
 * @throws Exception propagated from the helpers and the write client
 */
private int doScheduleAndCluster(JavaSparkContext jsc) throws Exception {
  LOG.info("Step 1: Do schedule");
  metaClient = HoodieTableMetaClient.reload(metaClient);
  String schemaStr = UtilHelpers.getSchemaFromLatestInstant(metaClient);
  try (SparkRDDWriteClient<HoodieRecordPayload> client =
           UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
    Option<String> instantTime = Option.empty();
    if (cfg.retryLastFailedClusteringJob) {
      instantTime = findTimedOutClusteringInstant(client);
    }
    if (!instantTime.isPresent()) {
      instantTime = doSchedule(client);
    }
    if (!instantTime.isPresent()) {
      LOG.info("Couldn't generate cluster plan");
      return -1;
    }
    LOG.info("The schedule instant time is " + instantTime.get());

    LOG.info("Step 2: Do cluster");
    Option<HoodieCommitMetadata> metadata = client.cluster(instantTime.get()).getCommitMetadata();
    clean(client);
    return UtilHelpers.handleErrors(metadata.get(), instantTime.get());
  }
}

/**
 * Returns the requested time of the last pending clustering instant when it started more than
 * {@code cfg.maxProcessingTimeMs} ago, i.e. when it is considered failed; empty otherwise.
 */
private Option<String> findTimedOutClusteringInstant(SparkRDDWriteClient<?> client) throws Exception {
  HoodieSparkTable<?> table = HoodieSparkTable.create(client.getConfig(), client.getEngineContext());
  client.validateAgainstTableProperties(table.getMetaClient().getTableConfig(), client.getConfig());
  Option<HoodieInstant> lastClusterOpt = table.getActiveTimeline().getLastPendingClusterInstant();
  if (!lastClusterOpt.isPresent()) {
    return Option.empty();
  }
  HoodieInstant inflightClusteringInstant = lastClusterOpt.get();
  Date clusteringStartTime = TimelineUtils.parseDateFromInstantTime(inflightClusteringInstant.requestedTime());
  if (clusteringStartTime.getTime() + cfg.maxProcessingTimeMs < System.currentTimeMillis()) {
    // if there has failed clustering, then we will use the failed clustering instant-time to trigger next clustering action which will rollback and clustering.
    LOG.info("Found failed clustering instant at : " + inflightClusteringInstant + "; Will rollback the failed clustering and re-trigger again.");
    return Option.of(inflightClusteringInstant.requestedTime());
  }
  LOG.info(inflightClusteringInstant + " might still be in progress, will trigger a new clustering job.");
  return Option.empty();
}
/**
 * Purges the pending clustering instant named by {@code cfg.clusteringInstantTime}.
 *
 * @param jsc Spark context used to create the write client
 * @return always 0; failures surface as exceptions
 * @throws Exception propagated from the helpers and the write client
 */
private int doPurgePendingInstant(JavaSparkContext jsc) throws Exception {
  metaClient = HoodieTableMetaClient.reload(metaClient);
  String schemaStr = UtilHelpers.getSchemaFromLatestInstant(metaClient);
  try (SparkRDDWriteClient<HoodieRecordPayload> client =
           UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
    // NOTE(review): cfg.clusteringInstantTime is passed through unchecked and may be null when
    // --instant-time was not given — confirm how purgePendingClustering handles that.
    client.purgePendingClustering(cfg.clusteringInstantTime);
  }
  return 0;
}
private void clean(SparkRDDWriteClient> client) {
if (!cfg.skipClean && client.getConfig().isAutoClean()) {
client.clean();
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy