org.apache.hudi.cli.commands.ClusteringCommand

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.cli.commands;

import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.commands.SparkMain.SparkCommand;
import org.apache.hudi.cli.utils.InputStreamConsumer;
import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.utilities.UtilHelpers;

import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.util.Utils;
import org.springframework.shell.standard.ShellComponent;
import org.springframework.shell.standard.ShellMethod;
import org.springframework.shell.standard.ShellOption;

import scala.collection.JavaConverters;

@ShellComponent
public class ClusteringCommand {

  /**
   * Schedule clustering table service.
   * <p>
   * Example:
   * > connect --path {path to hudi table}
   * > clustering schedule --sparkMaster local --sparkMemory 2g
   */
  @ShellMethod(key = "clustering schedule", value = "Schedule Clustering")
  public String scheduleClustering(
      @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER,
          help = "Spark master") final String master,
      @ShellOption(value = "--sparkMemory", defaultValue = "1g",
          help = "Spark executor memory") final String sparkMemory,
      @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations "
          + "for hoodie client for clustering", defaultValue = "") final String propsFilePath,
      @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can "
          + "be passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);

    // First get a clustering instant time and pass it to spark launcher for scheduling clustering
    String clusteringInstantTime = client.createNewInstantTime();
    SparkMain.addAppArgs(sparkLauncher, SparkCommand.CLUSTERING_SCHEDULE, master, sparkMemory, HoodieCLI.basePath,
        client.getTableConfig().getTableName(), clusteringInstantTime, propsFilePath);
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to schedule clustering for " + clusteringInstantTime;
    }
    return "Succeeded to schedule clustering for " + clusteringInstantTime;
  }

  /**
   * Run clustering table service.
   * <p>
   * Example:
   * > connect --path {path to hudi table}
   * > clustering schedule --sparkMaster local --sparkMemory 2g
   * > clustering run --sparkMaster local --sparkMemory 2g --clusteringInstant 20211124005208
   */
  @ShellMethod(key = "clustering run", value = "Run Clustering")
  public String runClustering(
      @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER,
          help = "Spark master") final String master,
      @ShellOption(value = "--sparkMemory", help = "Spark executor memory", defaultValue = "4g") final String sparkMemory,
      @ShellOption(value = "--parallelism", help = "Parallelism for hoodie clustering", defaultValue = "1") final String parallelism,
      @ShellOption(value = "--retry", help = "Number of retries", defaultValue = "1") final String retry,
      @ShellOption(value = "--clusteringInstant", help = "Clustering instant time",
          defaultValue = ShellOption.NULL) final String clusteringInstantTime,
      @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
          + "hoodie client for clustering", defaultValue = "") final String propsFilePath,
      @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
          + "passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    SparkMain.addAppArgs(sparkLauncher, SparkCommand.CLUSTERING_RUN, master, sparkMemory, HoodieCLI.basePath,
        client.getTableConfig().getTableName(), clusteringInstantTime, parallelism, retry, propsFilePath);
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to run clustering for " + clusteringInstantTime;
    }
    return "Succeeded to run clustering for " + clusteringInstantTime;
  }

  /**
   * Schedule and run clustering table service: make a cluster plan first, then execute that plan immediately.
   * <p>
   * Example:
   * > connect --path {path to hudi table}
   * > clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g
   */
  @ShellMethod(key = "clustering scheduleAndExecute", value = "Run Clustering. Make a cluster plan first and execute that plan immediately")
  public String runClustering(
      @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER,
          help = "Spark master") final String master,
      @ShellOption(value = "--sparkMemory", help = "Spark executor memory", defaultValue = "4g") final String sparkMemory,
      @ShellOption(value = "--parallelism", help = "Parallelism for hoodie clustering", defaultValue = "1") final String parallelism,
      @ShellOption(value = "--retry", help = "Number of retries", defaultValue = "1") final String retry,
      @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
          + "hoodie client for clustering", defaultValue = "") final String propsFilePath,
      @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
          + "passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    SparkMain.addAppArgs(sparkLauncher, SparkCommand.CLUSTERING_SCHEDULE_AND_EXECUTE, master, sparkMemory,
        HoodieCLI.basePath, client.getTableConfig().getTableName(), parallelism, retry, propsFilePath);
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to run clustering for scheduleAndExecute.";
    }
    return "Succeeded to run clustering for scheduleAndExecute";
  }
}
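All three commands follow the same launch-and-wait pattern: build a SparkLauncher from the default Spark properties, append the arguments for the corresponding SparkCommand, start the child Spark process, drain its output, and map the exit code to a success or failure message. Running clustering in a separate process rather than in the shell's own JVM keeps the interactive shell responsive and isolates Spark's classpath. The sketch below illustrates only that generic pattern using the stock org.apache.spark.launcher.SparkLauncher API; the jar path, main class wiring, memory setting, and application arguments are hypothetical placeholders, not what SparkUtil and SparkMain actually resolve inside the Hudi CLI.

import org.apache.spark.launcher.SparkLauncher;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

public final class LaunchAndWaitExample {

  public static void main(String[] args) throws Exception {
    // Assumptions: the jar path and app arguments are illustrative only; the real CLI
    // resolves the Spark app and its argument layout through SparkUtil/SparkMain.
    Process process = new SparkLauncher()
        .setAppResource("/path/to/spark-app.jar")      // hypothetical application jar
        .setMainClass("com.example.ClusteringJob")     // hypothetical main class
        .setMaster("local[2]")
        .setConf(SparkLauncher.DRIVER_MEMORY, "2g")
        .addAppArgs("schedule")                        // hypothetical argument
        .launch();

    // Drain stdout and stderr so the child process cannot block on a full pipe,
    // analogous to what InputStreamConsumer.captureOutput does in the CLI.
    pipe(process.getInputStream(), false);
    pipe(process.getErrorStream(), true);

    int exitCode = process.waitFor();
    System.out.println(exitCode == 0 ? "Succeeded" : "Failed with exit code " + exitCode);
  }

  private static void pipe(InputStream in, boolean isError) {
    new Thread(() -> new BufferedReader(new InputStreamReader(in)).lines()
        .forEach(isError ? System.err::println : System.out::println)).start();
  }
}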




