/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.cli.commands;

import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.commands.SparkMain.SparkCommand;
import org.apache.hudi.cli.utils.InputStreamConsumer;
import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.utilities.UtilHelpers;

import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.util.Utils;
import org.springframework.shell.standard.ShellComponent;
import org.springframework.shell.standard.ShellMethod;
import org.springframework.shell.standard.ShellOption;

import scala.collection.JavaConverters;

@ShellComponent
public class ClusteringCommand {

  /**
   * Schedule clustering table service.
   *
   * Example:
   * > connect --path {path to hudi table}
   * > clustering schedule --sparkMaster local --sparkMemory 2g
   */
  @ShellMethod(key = "clustering schedule", value = "Schedule Clustering")
  public String scheduleClustering(
      @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
      @ShellOption(value = "--sparkMemory", defaultValue = "1g", help = "Spark executor memory") final String sparkMemory,
      @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations "
          + "for hoodie client for clustering", defaultValue = "") final String propsFilePath,
      @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can "
          + "be passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);
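    // Resolve the default Spark properties file (e.g. spark-defaults.conf) from the environment for the launcher.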
    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    // First get a clustering instant time and pass it to spark launcher for scheduling clustering
    String clusteringInstantTime = client.createNewInstantTime();
    SparkMain.addAppArgs(sparkLauncher, SparkCommand.CLUSTERING_SCHEDULE, master, sparkMemory,
        HoodieCLI.basePath, client.getTableConfig().getTableName(), clusteringInstantTime, propsFilePath);
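    // Validate any --hoodieConfigs entries and forward them to the launched Spark job.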
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
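    // Drain the child process's stdout/stderr so it cannot block on full output pipes.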
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to schedule clustering for instant " + clusteringInstantTime;
    }
    return "Successfully scheduled clustering for instant " + clusteringInstantTime;
  }

  /**
   * Run clustering table service.
   *
   * Example:
   * > connect --path {path to hudi table}
   * > clustering schedule --sparkMaster local --sparkMemory 2g
   * > clustering run --sparkMaster local --sparkMemory 2g --clusteringInstant 20211124005208
   */
  @ShellMethod(key = "clustering run", value = "Run Clustering")
  public String runClustering(
      @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
      @ShellOption(value = "--sparkMemory", help = "Spark executor memory", defaultValue = "4g") final String sparkMemory,
      @ShellOption(value = "--parallelism", help = "Parallelism for hoodie clustering", defaultValue = "1") final String parallelism,
      @ShellOption(value = "--retry", help = "Number of retries", defaultValue = "1") final String retry,
      @ShellOption(value = "--clusteringInstant", help = "Clustering instant time",
          defaultValue = ShellOption.NULL) final String clusteringInstantTime,
      @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
          + "hoodie client for clustering", defaultValue = "") final String propsFilePath,
      @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
          + "passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);
    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
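    // --clusteringInstant is optional (ShellOption.NULL); if omitted, the launched job selects a pending
    // clustering instant to execute (typically the earliest, depending on the Hudi version).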
    SparkMain.addAppArgs(sparkLauncher, SparkCommand.CLUSTERING_RUN, master, sparkMemory,
        HoodieCLI.basePath, client.getTableConfig().getTableName(), clusteringInstantTime,
        parallelism, retry, propsFilePath);
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to run clustering for instant " + clusteringInstantTime;
    }
    return "Successfully ran clustering for instant " + clusteringInstantTime;
  }

  /**
   * Schedule a clustering plan and execute it immediately.
   *
   * Example:
   * > connect --path {path to hudi table}
   * > clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g
   */
  @ShellMethod(key = "clustering scheduleAndExecute", value = "Run Clustering. Make a cluster plan first and execute that plan immediately")
  public String runClusteringScheduleAndExecute(
      @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
      @ShellOption(value = "--sparkMemory", help = "Spark executor memory", defaultValue = "4g") final String sparkMemory,
      @ShellOption(value = "--parallelism", help = "Parallelism for hoodie clustering", defaultValue = "1") final String parallelism,
      @ShellOption(value = "--retry", help = "Number of retries", defaultValue = "1") final String retry,
      @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
          + "hoodie client for clustering", defaultValue = "") final String propsFilePath,
      @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
          + "passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);
    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    SparkMain.addAppArgs(sparkLauncher, SparkCommand.CLUSTERING_SCHEDULE_AND_EXECUTE, master, sparkMemory,
        HoodieCLI.basePath, client.getTableConfig().getTableName(), parallelism, retry, propsFilePath);
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to run clustering via scheduleAndExecute.";
    }
    return "Successfully ran clustering via scheduleAndExecute.";
  }
}