org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.runners.dataflow.options;

import com.fasterxml.jackson.annotation.JsonIgnore;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.beam.runners.dataflow.DataflowRunnerInfo;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.DefaultValueFactory;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.Hidden;
import org.apache.beam.sdk.options.PipelineOptions;

/**
 * Options that are used to configure the Dataflow pipeline worker pool.
 */
@Description("Options that are used to configure the Dataflow pipeline worker pool.")
public interface DataflowPipelineWorkerPoolOptions extends PipelineOptions {
  /**
   * Number of workers to use when executing the Dataflow job. Note that selection of an
   * autoscaling algorithm other than {@code NONE} will affect the size of the worker pool. If
   * left unspecified, the Dataflow service will determine the number of workers.
   */
  @Description("Number of workers to use when executing the Dataflow job. Note that "
      + "selection of an autoscaling algorithm other than \"NONE\" will affect the "
      + "size of the worker pool. If left unspecified, the Dataflow service will "
      + "determine the number of workers.")
  int getNumWorkers();
  void setNumWorkers(int value);
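
  // Editor's note (illustrative, not part of the original source): when an
  // autoscaling algorithm other than NONE is selected, the Dataflow service
  // treats numWorkers as the initial size of the worker pool and
  // maxNumWorkers (below) as its upper bound.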

  /**
   * Type of autoscaling algorithm to use.
   */
  @Experimental(Experimental.Kind.AUTOSCALING)
  enum AutoscalingAlgorithmType {
    /** Use numWorkers machines. Do not autoscale the worker pool. */
    NONE("AUTOSCALING_ALGORITHM_NONE"),

    /** @deprecated use {@link #THROUGHPUT_BASED} instead. */
    @Deprecated
    BASIC("AUTOSCALING_ALGORITHM_BASIC"),

    /** Autoscale the worker pool based on throughput (up to maxNumWorkers). */
    THROUGHPUT_BASED("AUTOSCALING_ALGORITHM_BASIC");

    private final String algorithm;

    private AutoscalingAlgorithmType(String algorithm) {
      this.algorithm = algorithm;
    }

    /** Returns the string representation of this type. */
    public String getAlgorithm() {
      return this.algorithm;
    }
  }
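
  // Illustrative example (not in the original source): the deprecated BASIC
  // constant and THROUGHPUT_BASED share the same service-side string, so
  //
  //   AutoscalingAlgorithmType.THROUGHPUT_BASED.getAlgorithm()
  //   AutoscalingAlgorithmType.BASIC.getAlgorithm()
  //
  // both return "AUTOSCALING_ALGORITHM_BASIC".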

  /**
   * [Experimental] The autoscaling algorithm to use for the worker pool.
   *
   * <ul>
   *   <li>NONE: does not change the size of the worker pool.
   *   <li>BASIC (deprecated): autoscale the worker pool size up to maxNumWorkers until the job
   *       completes.
   *   <li>THROUGHPUT_BASED: autoscale the worker pool based on throughput (up to maxNumWorkers).
   * </ul>
   */
  @Description("[Experimental] The autoscaling algorithm to use for the worker pool. "
      + "NONE: does not change the size of the worker pool. "
      + "BASIC (deprecated): autoscale the worker pool size up to maxNumWorkers until the job "
      + "completes. "
      + "THROUGHPUT_BASED: autoscale the worker pool based on throughput (up to maxNumWorkers).")
  @Experimental(Experimental.Kind.AUTOSCALING)
  AutoscalingAlgorithmType getAutoscalingAlgorithm();
  void setAutoscalingAlgorithm(AutoscalingAlgorithmType value);

  /**
   * The maximum number of workers to use for the worker pool. This option limits the size of the
   * worker pool for the lifetime of the job, including pipeline updates. If left unspecified, the
   * Dataflow service will compute a ceiling.
   */
  @Description("The maximum number of workers to use for the worker pool. This option limits the "
      + "size of the worker pool for the lifetime of the job, including pipeline updates. "
      + "If left unspecified, the Dataflow service will compute a ceiling.")
  int getMaxNumWorkers();
  void setMaxNumWorkers(int value);

  /** Remote worker disk size, in gigabytes, or 0 to use the default size. */
  @Description("Remote worker disk size, in gigabytes, or 0 to use the default size.")
  int getDiskSizeGb();
  void setDiskSizeGb(int value);

  /**
   * Docker container image that executes the Dataflow worker harness, residing in Google
   * Container Registry.
   */
  @Default.InstanceFactory(WorkerHarnessContainerImageFactory.class)
  @Description("Docker container image that executes the Dataflow worker harness, residing in "
      + "Google Container Registry.")
  @Hidden
  String getWorkerHarnessContainerImage();
  void setWorkerHarnessContainerImage(String value);

  /**
   * Returns the default Docker container image that executes the Dataflow worker harness,
   * residing in Google Container Registry.
   */
  class WorkerHarnessContainerImageFactory implements DefaultValueFactory<String> {
    @Override
    public String create(PipelineOptions options) {
      String containerVersion = DataflowRunnerInfo.getDataflowRunnerInfo().getContainerVersion();
      return String.format("dataflow.gcr.io/v1beta3/IMAGE:%s", containerVersion);
    }
  }

  /**
   * GCE network for launching workers. For more information, see the reference documentation:
   * https://cloud.google.com/compute/docs/networking
   *
   * <p>Default is up to the Dataflow service.
   */
  @Description("GCE network for launching workers. For more information, see the reference "
      + "documentation https://cloud.google.com/compute/docs/networking. "
      + "Default is up to the Dataflow service.")
  String getNetwork();
  void setNetwork(String value);

  /**
   * GCE subnetwork for launching workers. For more information, see the reference documentation:
   * https://cloud.google.com/compute/docs/networking
   *
   * <p>Default is up to the Dataflow service. Expected format is
   * regions/REGION/subnetworks/SUBNETWORK or the fully qualified subnetwork name, beginning with
   * https://..., e.g. https://www.googleapis.com/compute/alpha/projects/PROJECT/
   * regions/REGION/subnetworks/SUBNETWORK
   */
  @Description("GCE subnetwork for launching workers. For more information, see the reference "
      + "documentation https://cloud.google.com/compute/docs/networking. "
      + "Default is up to the Dataflow service.")
  String getSubnetwork();
  void setSubnetwork(String value);

  /**
   * GCE availability zone for launching workers. See
   * https://developers.google.com/compute/docs/zones for a list of valid options.
   *
   * <p>Default is up to the Dataflow service.
   */
  @Description("GCE availability zone for launching workers. See "
      + "https://developers.google.com/compute/docs/zones for a list of valid options. "
      + "Default is up to the Dataflow service.")
  String getZone();
  void setZone(String value);

  /**
   * Machine type to create Dataflow worker VMs as.
   *
   * <p>See https://cloud.google.com/compute/docs/machine-types for a list of valid options.
   *
   * <p>If unset, the Dataflow service will choose a reasonable default.
   */
  @Description("Machine type to create Dataflow worker VMs as. See "
      + "https://cloud.google.com/compute/docs/machine-types for a list of valid options. "
      + "If unset, the Dataflow service will choose a reasonable default.")
  String getWorkerMachineType();
  void setWorkerMachineType(String value);

  /**
   * List of local files to make available to workers.
   *
   * <p>Files are placed on the worker's classpath.
   *
   * <p>The default value is the list of jars from the main program's classpath.
   */
  @Description("Files to stage on GCS and make available to workers. "
      + "Files are placed on the worker's classpath. "
      + "The default value is all files from the classpath.")
  @JsonIgnore
  List<String> getFilesToStage();
  void setFilesToStage(List<String> value);

  /**
   * Specifies what type of persistent disk is used. The value is a full disk type resource, e.g.,
   * compute.googleapis.com/projects//zones//diskTypes/pd-ssd. For more information, see the API
   * reference documentation for DiskTypes:
   * https://cloud.google.com/compute/docs/reference/latest/diskTypes
   */
  @Description("Specifies what type of persistent disk is used. The "
      + "value is a full URL of a disk type resource, e.g., "
      + "compute.googleapis.com/projects//zones//diskTypes/pd-ssd. For more "
      + "information, see the API reference documentation for DiskTypes: "
      + "https://cloud.google.com/compute/docs/reference/latest/diskTypes")
  String getWorkerDiskType();
  void setWorkerDiskType(String value);

  /**
   * Specifies whether worker pools should be started with public IP addresses.
   *
   * <p>WARNING: This feature is experimental. You must be whitelisted to use it.
   */
  @Description("Specifies whether worker pools should be started with public IP addresses. "
      + "WARNING: This feature is experimental. You must be whitelisted to use it.")
  @Experimental
  @JsonIgnore
  @Nullable
  Boolean getUsePublicIps();
  void setUsePublicIps(@Nullable Boolean value);
}
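
Usage sketch (editor's addition, not part of the source file above): the snippet below shows one common way these options are populated, assuming the Beam Java SDK and the Dataflow runner are on the classpath. Flag names follow the standard PipelineOptionsFactory convention of matching the getter names (e.g. --numWorkers for getNumWorkers()); pipeline construction itself is elided, and the machine type and zone values are hypothetical.

import org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions;
import org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class WorkerPoolOptionsExample {
  public static void main(String[] args) {
    // Parse command-line flags such as:
    //   --numWorkers=5 --maxNumWorkers=20 --autoscalingAlgorithm=THROUGHPUT_BASED
    DataflowPipelineWorkerPoolOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .as(DataflowPipelineWorkerPoolOptions.class);

    // Values can also be set programmatically via the setters defined above.
    options.setNumWorkers(5);
    options.setMaxNumWorkers(20);
    options.setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED);
    options.setWorkerMachineType("n1-standard-4"); // hypothetical machine type
    options.setZone("us-central1-f");              // hypothetical zone

    // Prints the service-side algorithm string, "AUTOSCALING_ALGORITHM_BASIC".
    System.out.println(options.getAutoscalingAlgorithm().getAlgorithm());
  }
}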




