org.apache.hudi.utilities.multitable.HoodieMultiTableServicesMain
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.utilities.multitable;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy;
import org.apache.hudi.utilities.HoodieCompactor;
import org.apache.hudi.utilities.IdentitySplitter;
import org.apache.hudi.utilities.UtilHelpers;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.StringJoiner;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

/**
 * Main function for executing multi-table services.
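 *
 * <p>A minimal programmatic usage sketch. The base path and master URL below are
 * illustrative placeholders, and checked exceptions from {@code startServices} are
 * left for the caller to handle:
 * <pre>{@code
 *   Config cfg = new Config();
 *   cfg.basePath = Collections.singletonList("/path/to/warehouse"); // hypothetical location
 *   cfg.autoDiscovery = true; // scan the base path for Hudi tables
 *   cfg.enableClean = true;   // enable the clean service
 *   cfg.batch = true;         // run each pipeline once, then exit
 *   JavaSparkContext jsc = UtilHelpers.buildSparkContext(cfg.appName, "local[2]", null);
 *   try {
 *     new HoodieMultiTableServicesMain(jsc, cfg).startServices();
 *   } finally {
 *     jsc.stop();
 *   }
 * }</pre>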
 */
public class HoodieMultiTableServicesMain {
  private static final Logger LOG = LoggerFactory.getLogger(HoodieMultiTableServicesMain.class);
  final Config cfg;
  final TypedProperties props;

  private final JavaSparkContext jsc;

  private ScheduledExecutorService executorService;

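  /**
   * Runs the table service pipeline of each table path exactly once, in parallel
   * on a fixed-size thread pool, returning when all pipelines finish and failing
   * fast as soon as any single pipeline throws.
   */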
  private void batchRunTableServices(List<String> tablePaths) throws InterruptedException, ExecutionException {
    ExecutorService executorService = Executors.newFixedThreadPool(cfg.poolSize);
    List<CompletableFuture<Void>> futures = tablePaths.stream()
        .map(basePath -> CompletableFuture.runAsync(
            () -> MultiTableServiceUtils.buildTableServicePipeline(jsc, basePath, cfg, props).execute(),
            executorService))
        .collect(Collectors.toList());
    CompletableFuture<Void> allComplete =
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]));
    CompletableFuture<Void> anyException = new CompletableFuture<>();
    for (CompletableFuture<Void> future : futures) {
      future.exceptionally((e) -> {
        anyException.completeExceptionally(e);
        return null;
      });
    }
    try {
      CompletableFuture.anyOf(allComplete, anyException).get();
    } catch (ExecutionException ee) {
      throw new ExecutionException("some table service failed", ee);
    } finally {
      executorService.shutdownNow();
    }
  }

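  /**
   * Schedules the table service pipeline of each table path to run repeatedly,
   * with a fixed period of {@code cfg.scheduleDelay} milliseconds, and blocks
   * until the executor is shut down via {@link #cancel()}.
   */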
  private void streamRunTableServices(List<String> tablePaths) throws InterruptedException {
    executorService = Executors.newScheduledThreadPool(cfg.poolSize);
    for (String tablePath : tablePaths) {
      TableServicePipeline pipeline = MultiTableServiceUtils.buildTableServicePipeline(jsc, tablePath, cfg, props);
      executorService.scheduleAtFixedRate(pipeline::execute, 0, cfg.scheduleDelay, TimeUnit.MILLISECONDS);
    }
    executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.MINUTES);
  }

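  /**
   * Shuts down the scheduled executor used in streaming mode, if one is running.
   */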
  public void cancel() {
    if (executorService != null) {
      executorService.shutdown();
    }
  }

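  /**
   * Builds the main entry, loading the client properties either from the --props
   * file on the file system or from the --hoodie-conf overrides.
   */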
  public HoodieMultiTableServicesMain(JavaSparkContext jsc, Config cfg) {
    this.cfg = cfg;
    this.jsc = jsc;
    this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) : readConfigFromFileSystem(jsc, cfg);
  }

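  /**
   * Resolves the table paths to serve, either by auto-discovering Hudi tables under
   * the configured base paths or by reading them from the properties, then runs the
   * table services in batch or streaming mode.
   */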
  public void startServices() throws ExecutionException, InterruptedException {
    LOG.info("StartServices Config: " + cfg);
    List<String> tablePaths;
    if (cfg.autoDiscovery) {
      // Multiple base paths may be specified; discover Hudi tables under each of them
      tablePaths = cfg.basePath.stream()
          .filter(this::pathExists)
          .flatMap(p -> MultiTableServiceUtils.findHoodieTablesUnderPath(jsc, p).stream())
          .collect(Collectors.toList());
    } else {
      tablePaths = MultiTableServiceUtils.getTablesToBeServedFromProps(jsc, props);
    }
    LOG.info("All table paths: " + String.join(",", tablePaths));
    if (cfg.batch) {
      batchRunTableServices(tablePaths);
    } else {
      streamRunTableServices(tablePaths);
    }
  }

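  /**
   * Reads the client properties from the file referenced by {@code cfg.propsFilePath}.
   */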
  private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
    return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs).getProps(true);
  }

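  /**
   * Returns true if the given path exists on its backing file system.
   */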
  private boolean pathExists(String path) {
    try {
      Path p = new Path(path);
      FileSystem fs = p.getFileSystem(jsc.hadoopConfiguration());
      return fs.exists(p);
    } catch (IOException e) {
      throw new HoodieIOException("Error checking whether path exists: " + path, e);
    }
  }

  /**
   * Command-line configs for running table services.
   */
  public static class Config implements Serializable {
    @Parameter(names = {"--base-path"}, description = "Base path for all the tables, this can be repeated",
        required = true, splitter = IdentitySplitter.class)
    public List<String> basePath = Collections.emptyList();

    @Parameter(names = {"--auto-discovery", "-a"}, description = "Whether to discover hudi tables in the base path")
    public boolean autoDiscovery = false;

    @Parameter(names = {"--parallelism"}, description = "Parallelism for hoodie table service")
    public int parallelism = 200;

    @Parameter(names = {"--batch", "-b"}, description = "Run services in batch or streaming mode")
    public boolean batch = false;

    @Parameter(names = {"--schedule-delay", "-d"}, description = "Table services schedule delay")
    public int scheduleDelay = 2000;

    @Parameter(names = {"--retry", "-r"}, description = "Table service retry count")
    public int retry = 1;

    @Parameter(names = {"--poolSize", "-p"}, description = "thread pool size")
    public int poolSize = Runtime.getRuntime().availableProcessors();

    @Parameter(names = {"--name", "-n"}, description = "Spark APP name")
    public String appName = "Hudi Table Service";

    @Parameter(names = {"--help", "-h"}, help = true)
    public Boolean help = false;

    @Parameter(names = {"--enable-compaction"}, help = true)
    public Boolean enableCompaction = false;

    @Parameter(names = {"--enable-clustering"}, help = true)
    public Boolean enableClustering = false;

    @Parameter(names = {"--enable-clean"}, help = true)
    public Boolean enableClean = false;

    @Parameter(names = {"--enable-archive"}, help = true)
    public Boolean enableArchive = false;

    @Parameter(names = {"--compaction-mode"}, description = "Set job mode: Set \"schedule\" means make a compact plan; "
        + "Set \"execute\" means execute a compact plan at given instant which means --instant-time is needed here; "
        + "Set \"scheduleAndExecute\" means make a compact plan first and execute that plan immediately")
    public String compactionRunningMode = HoodieCompactor.EXECUTE;

    @Parameter(names = {"--strategy", "-st"}, description = "Strategy Class")
    public String compactionStrategyClassName = LogFileSizeBasedCompactionStrategy.class.getName();

    @Parameter(names = {"--clustering-mode"}, description = "Set job mode: Set \"schedule\" means make a clustering plan; "
        + "Set \"execute\" means execute a clustering plan at given instant which means --instant-time is needed here; "
        + "Set \"scheduleAndExecute\" means make a clustering plan first and execute that plan immediately")
    public String clusteringRunningMode = HoodieCompactor.SCHEDULE_AND_EXECUTE;

    @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master")
    public String sparkMaster;

    @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use")
    public String sparkMemory = null;

    @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
        + "hoodie client for table service")
    public String propsFilePath = null;

    @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
        + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
        splitter = IdentitySplitter.class)
    public List<String> configs = new ArrayList<>();

    @Override
    public String toString() {
      return new StringJoiner(", ", Config.class.getSimpleName() + "[", "]")
          .add("basePath=" + basePath)
          .add("autoDiscovery=" + autoDiscovery)
          .add("parallelism=" + parallelism)
          .add("batch=" + batch)
          .add("scheduleDelay=" + scheduleDelay)
          .add("retry=" + retry)
          .add("poolSize=" + poolSize)
          .add("appName='" + appName + "'")
          .add("help=" + help)
          .add("enableCompaction=" + enableCompaction)
          .add("enableClustering=" + enableClustering)
          .add("enableClean=" + enableClean)
          .add("enableArchive=" + enableArchive)
          .add("compactionRunningMode='" + compactionRunningMode + "'")
          .add("compactionStrategyClassName='" + compactionStrategyClassName + "'")
          .add("clusteringRunningMode='" + clusteringRunningMode + "'")
          .add("sparkMaster='" + sparkMaster + "'")
          .add("sparkMemory='" + sparkMemory + "'")
          .add("propsFilePath='" + propsFilePath + "'")
          .add("configs=" + configs)
          .toString();
    }
  }

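  /**
   * CLI entry point: parses the arguments, builds a Spark context, and starts the services.
   */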
  public static void main(String[] args) {
    final HoodieMultiTableServicesMain.Config cfg = new HoodieMultiTableServicesMain.Config();
    JCommander cmd = new JCommander(cfg, null, args);
    if (cfg.help || args.length == 0) {
      cmd.usage();
      System.exit(1);
    }
    JavaSparkContext jsc = UtilHelpers.buildSparkContext(cfg.appName, cfg.sparkMaster, cfg.sparkMemory);
    try {
      new HoodieMultiTableServicesMain(jsc, cfg).startServices();
    } catch (Throwable throwable) {
      LOG.error("Fail to run table services, ", throwable);
    } finally {
      jsc.stop();
    }
  }

}