org.apache.flinkx.api.StreamExecutionEnvironment.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flinkx.api

import com.esotericsoftware.kryo.Serializer
import org.apache.flink.annotation.{Experimental, Internal, Public, PublicEvolving}
import org.apache.flink.api.common.ExecutionConfig.ClosureCleanerLevel
import org.apache.flink.api.common.cache.DistributedCache
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.io.{FileInputFormat, FilePathFilter, InputFormat}
import org.apache.flink.api.common.operators.SlotSharingGroup
import org.apache.flink.api.common.restartstrategy.RestartStrategies.RestartStrategyConfiguration
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.{ExecutionConfig, JobExecutionResult, RuntimeExecutionMode}
import org.apache.flink.api.connector.source.lib.NumberSequenceSource
import org.apache.flink.api.connector.source.{Source, SourceSplit}
import org.apache.flink.api.java.tuple
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer
import org.apache.flink.configuration.{Configuration, ReadableConfig}
import org.apache.flink.core.execution.{JobClient, JobListener}
import org.apache.flink.core.fs.Path
import org.apache.flink.runtime.state.StateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.{CheckpointConfig, StreamExecutionEnvironment => JavaEnv}
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
import org.apache.flink.streaming.api.functions.source._
import org.apache.flink.streaming.api.graph.StreamGraph
import org.apache.flink.util.{SplittableIterator, TernaryBoolean}
import org.apache.flinkx.api.ScalaStreamOps._

import java.net.URI
import java.util
import scala.jdk.CollectionConverters._
import scala.language.implicitConversions

@Public
class StreamExecutionEnvironment(javaEnv: JavaEnv) {

  /** @return the wrapped Java environment */
  def getJavaEnv: JavaEnv = javaEnv

  /** Gets the config object. */
  def getConfig = javaEnv.getConfig

  /** Gets cache files. */
  def getCachedFiles: util.List[tuple.Tuple2[String, DistributedCache.DistributedCacheEntry]] = javaEnv.getCachedFiles

  /** Gets the config JobListeners. */
  @PublicEvolving
  def getJobListeners: util.List[JobListener] = javaEnv.getJobListeners

  /** Sets the parallelism for operations executed through this environment. Setting a parallelism of x here will cause
    * all operators (such as join, map, reduce) to run with x parallel instances. This value can be overridden by
    * specific operations using [[DataStream#setParallelism(int)]].
    */
  def setParallelism(parallelism: Int): Unit = {
    javaEnv.setParallelism(parallelism)
  }

  /** Sets the runtime execution mode for the application (see [[RuntimeExecutionMode]]). This is equivalent to setting
    * the "execution.runtime-mode" in your application's configuration file.
    *
    * We recommend users NOT to use this method but to set the "execution.runtime-mode" via the command line when
    * submitting the application. Keeping the application code configuration-free allows for more flexibility, as the
    * same application can then be executed in any execution mode.
    *
    * @param executionMode
    *   the desired execution mode.
    * @return
    *   The execution environment of your application.
    */
  @PublicEvolving
  def setRuntimeMode(executionMode: RuntimeExecutionMode): StreamExecutionEnvironment = {
    javaEnv.setRuntimeMode(executionMode)
    this
  }

  /** Sets the maximum degree of parallelism defined for the program. The maximum degree of parallelism specifies the
    * upper limit for dynamic scaling. It also defines the number of key groups used for partitioned state.
    */
  def setMaxParallelism(maxParallelism: Int): Unit = {
    javaEnv.setMaxParallelism(maxParallelism)
  }
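
  // Usage sketch (illustrative only; the numbers are arbitrary examples). The max parallelism
  // bounds later rescaling, so it is usually set well above the current parallelism:
  //
  //   env.setParallelism(4)
  //   env.setMaxParallelism(128)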

  /** Register a slot sharing group with its resource spec.
    *
    * Note that a slot sharing group hints to the scheduler that the grouped operators CAN be deployed into a shared
    * slot. There's no guarantee that the scheduler always deploys the grouped operators together. In cases where
    * grouped operators are deployed into separate slots, the slot resources will be derived from the specified group
    * requirements.
    *
    * @param slotSharingGroup
    *   which contains name and its resource spec.
    */
  @PublicEvolving
  def registerSlotSharingGroup(slotSharingGroup: SlotSharingGroup): StreamExecutionEnvironment = {
    javaEnv.registerSlotSharingGroup(slotSharingGroup)
    this
  }

  /** Returns the default parallelism for this execution environment. Note that this value can be overridden by
    * individual operations using [[DataStream#setParallelism(int)]].
    */
  def getParallelism = javaEnv.getParallelism

  /** Returns the maximum degree of parallelism defined for the program.
    *
    * The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also defines the number of
    * key groups used for partitioned state.
    */
  def getMaxParallelism: Int = javaEnv.getMaxParallelism

  /** Sets the maximum time frequency (milliseconds) for the flushing of the output buffers. By default the output
    * buffers flush frequently to provide low latency and to aid a smooth developer experience. Setting the parameter
    * can result in three logical modes:
    *
    *   - A positive integer triggers flushing periodically by that integer (in milliseconds)
    *   - 0 triggers flushing after every record, thus minimizing latency
    *   - -1 triggers flushing only when the output buffer is full, thus maximizing throughput
    */
  def setBufferTimeout(timeoutMillis: Long): StreamExecutionEnvironment = {
    javaEnv.setBufferTimeout(timeoutMillis)
    this
  }

  /** Gets the default buffer timeout set for this environment */
  def getBufferTimeout: Long = javaEnv.getBufferTimeout

  /** Disables operator chaining for streaming operators. Operator chaining allows non-shuffle operations to be
    * co-located in the same thread, fully avoiding serialization and de-serialization.
    */
  @PublicEvolving
  def disableOperatorChaining(): StreamExecutionEnvironment = {
    javaEnv.disableOperatorChaining()
    this
  }

  // ------------------------------------------------------------------------
  //  Checkpointing Settings
  // ------------------------------------------------------------------------

  /** Gets the checkpoint config, which defines values like checkpoint interval, delay between checkpoints, etc. */
  def getCheckpointConfig: CheckpointConfig = javaEnv.getCheckpointConfig

  /** Enables checkpointing for the streaming job. The distributed state of the streaming dataflow will be
    * periodically snapshotted. In case of a failure, the streaming dataflow will be restarted from the latest
    * completed checkpoint.
    *
    * The job draws checkpoints periodically, in the given interval. The system uses the given [[CheckpointingMode]]
    * for the checkpointing ("exactly once" vs "at least once"). The state will be stored in the configured state
    * backend.
    *
    * NOTE: Checkpointing iterative streaming dataflows is not properly supported at the moment. For that reason,
    * iterative jobs will not be started if used with enabled checkpointing. To override this mechanism, use the
    * [[enableCheckpointing(long, CheckpointingMode, boolean)]] method.
    *
    * @param interval
    *   Time interval between state checkpoints in milliseconds.
    * @param mode
    *   The checkpointing mode, selecting between "exactly once" and "at least once" guarantees.
    */
  def enableCheckpointing(interval: Long, mode: CheckpointingMode): StreamExecutionEnvironment = {
    javaEnv.enableCheckpointing(interval, mode)
    this
  }

  /** Enables checkpointing for the streaming job. The distributed state of the streaming dataflow will be
    * periodically snapshotted. In case of a failure, the streaming dataflow will be restarted from the latest
    * completed checkpoint.
    *
    * The job draws checkpoints periodically, in the given interval. The program will use
    * [[CheckpointingMode.EXACTLY_ONCE]] mode. The state will be stored in the configured state backend.
    *
    * NOTE: Checkpointing iterative streaming dataflows is not properly supported at the moment. For that reason,
    * iterative jobs will not be started if used with enabled checkpointing. To override this mechanism, use the
    * [[enableCheckpointing(long, CheckpointingMode, boolean)]] method.
    *
    * @param interval
    *   Time interval between state checkpoints in milliseconds.
    */
  def enableCheckpointing(interval: Long): StreamExecutionEnvironment = {
    enableCheckpointing(interval, CheckpointingMode.EXACTLY_ONCE)
  }

  def getCheckpointingMode = javaEnv.getCheckpointingMode
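
  // Usage sketch (illustrative only): exactly-once checkpoints every 10 seconds plus some
  // commonly tuned CheckpointConfig settings; all values are arbitrary examples.
  //
  //   env.enableCheckpointing(10000L, CheckpointingMode.EXACTLY_ONCE)
  //   env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500L)
  //   env.getCheckpointConfig.setCheckpointTimeout(60000L)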

  /** Sets the state backend that describes how to store and checkpoint operator state. It defines the data
    * structures that hold state during execution (for example hash tables, RocksDB, or other data stores).
    *
    * State managed by the state backend includes both keyed state that is accessible on
    * [[org.apache.flinkx.api.KeyedStream]], as well as state maintained directly by the user code that implements
    * [[org.apache.flink.streaming.api.checkpoint.CheckpointedFunction]].
    *
    * The [[org.apache.flink.runtime.state.hashmap.HashMapStateBackend]] maintains state in heap memory, as objects.
    * It is lightweight without extra dependencies, but is limited to JVM heap memory.
    *
    * In contrast, the '''EmbeddedRocksDBStateBackend''' stores its state in an embedded '''RocksDB''' instance. This
    * state backend can store very large state that exceeds memory and spills to local disk. All key/value state
    * (including windows) is stored in the key/value index of RocksDB.
    *
    * In both cases, fault tolerance is managed via the job's [[org.apache.flink.runtime.state.CheckpointStorage]],
    * which configures how and where state backends persist during a checkpoint.
    *
    * @return
    *   This StreamExecutionEnvironment itself, to allow chaining of function calls.
    * @see
    *   #getStateBackend()
    */
  @PublicEvolving
  def setStateBackend(backend: StateBackend): StreamExecutionEnvironment = {
    javaEnv.setStateBackend(backend)
    this
  }

  /** Returns the state backend that defines how to store and checkpoint state. */
  @PublicEvolving
  def getStateBackend: StateBackend = javaEnv.getStateBackend

  /** Enable the change log for the current state backend. This change log allows operators to persist state changes
    * in a very fine-grained manner. Currently, the change log only applies to keyed state, so non-keyed operator
    * state and channel state are persisted as usual. The 'state' here refers to 'keyed state'. Details are as
    * follows:
    *
    * Stateful operators write the state changes to that log (logging the state), in addition to applying them to the
    * state tables in RocksDB or the in-mem Hashtable.
    *
    * An operator can acknowledge a checkpoint as soon as the changes in the log have reached the durable checkpoint
    * storage.
    *
    * The state tables are persisted periodically, independent of the checkpoints. We call this the materialization of
    * the state on the checkpoint storage.
    *
    * Once the state is materialized on checkpoint storage, the state changelog can be truncated to the corresponding
    * point.
    *
    * It establishes a way to drastically reduce the checkpoint interval for streaming applications across state
    * backends. For more details please check FLIP-158.
    *
    * If this method is not called explicitly, it means no preference for enabling the change log. Configs for change
    * log enabling will override in different config levels (job/local/cluster).
    *
    * @param enabled
    *   true to enable the change log for the state backend explicitly, otherwise disable the change log.
    * @return
    *   This StreamExecutionEnvironment itself, to allow chaining of function calls.
    * @see
    *   #isChangelogStateBackendEnabled()
    */
  @PublicEvolving
  def enableChangelogStateBackend(enabled: Boolean): StreamExecutionEnvironment = {
    javaEnv.enableChangelogStateBackend(enabled)
    this
  }

  /** Gets the enable status of the change log for the state backend.
    *
    * @return
    *   a [[TernaryBoolean]] for the enable status of the change log for the state backend. Could be
    *   [[TernaryBoolean#UNDEFINED]] if the user never specified this by calling
    *   [[enableChangelogStateBackend(boolean)]].
    */
  @PublicEvolving
  def isChangelogStateBackendEnabled: TernaryBoolean = javaEnv.isChangelogStateBackendEnabled
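
  // Usage sketch (illustrative only): keeping operator state on the JVM heap via the
  // HashMapStateBackend; any implementation of StateBackend can be passed instead.
  //
  //   import org.apache.flink.runtime.state.hashmap.HashMapStateBackend
  //   env.setStateBackend(new HashMapStateBackend())
  //   env.enableChangelogStateBackend(true)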

  /** Sets the default savepoint directory, where savepoints will be written to if none is explicitly provided when
    * triggered.
    *
    * @return
    *   This StreamExecutionEnvironment itself, to allow chaining of function calls.
    * @see
    *   #getDefaultSavepointDirectory()
    */
  @PublicEvolving
  def setDefaultSavepointDirectory(savepointDirectory: String): StreamExecutionEnvironment = {
    javaEnv.setDefaultSavepointDirectory(savepointDirectory)
    this
  }

  /** Sets the default savepoint directory, where savepoints will be written to if none is explicitly provided when
    * triggered.
    *
    * @return
    *   This StreamExecutionEnvironment itself, to allow chaining of function calls.
    * @see
    *   #getDefaultSavepointDirectory()
    */
  @PublicEvolving
  def setDefaultSavepointDirectory(savepointDirectory: URI): StreamExecutionEnvironment = {
    javaEnv.setDefaultSavepointDirectory(savepointDirectory)
    this
  }

  /** Sets the default savepoint directory, where savepoints will be written to if none is explicitly provided when
    * triggered.
    *
    * @return
    *   This StreamExecutionEnvironment itself, to allow chaining of function calls.
    * @see
    *   #getDefaultSavepointDirectory()
    */
  @PublicEvolving
  def setDefaultSavepointDirectory(savepointDirectory: Path): StreamExecutionEnvironment = {
    javaEnv.setDefaultSavepointDirectory(savepointDirectory)
    this
  }

  /** Gets the default savepoint directory for this Job.
    *
    * @see
    *   #setDefaultSavepointDirectory(Path)
    */
  @PublicEvolving
  def getDefaultSavepointDirectory: Path = javaEnv.getDefaultSavepointDirectory

  /** Sets the restart strategy configuration. The configuration specifies which restart strategy will be used for
    * the execution graph in case of a restart.
    *
    * @param restartStrategyConfiguration
    *   Restart strategy configuration to be set
    */
  @PublicEvolving
  def setRestartStrategy(restartStrategyConfiguration: RestartStrategyConfiguration): Unit = {
    javaEnv.setRestartStrategy(restartStrategyConfiguration)
  }

  /** Returns the specified restart strategy configuration.
    *
    * @return
    *   The restart strategy configuration to be used
    */
  @PublicEvolving
  def getRestartStrategy: RestartStrategyConfiguration = javaEnv.getRestartStrategy
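
  // Usage sketch (illustrative only): restart a failed job at most 3 times with a 10 second
  // delay between attempts; RestartStrategies comes from
  // org.apache.flink.api.common.restartstrategy.
  //
  //   import org.apache.flink.api.common.restartstrategy.RestartStrategies
  //   env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10000L))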

  // --------------------------------------------------------------------------------------------
  //  Registry for types and serializers
  // --------------------------------------------------------------------------------------------

  /** Adds a new Kryo default serializer to the Runtime.
    *
    * Note that the serializer instance must be serializable (as defined by java.io.Serializable), because it may be
    * distributed to the worker nodes by Java serialization.
    *
    * @param type
    *   The class of the types serialized with the given serializer.
    * @param serializer
    *   The serializer to use.
    */
  def addDefaultKryoSerializer[T <: Serializer[_] with Serializable](`type`: Class[_], serializer: T): Unit = {
    javaEnv.addDefaultKryoSerializer(`type`, serializer)
  }

  /** Adds a new Kryo default serializer to the Runtime.
    *
    * @param type
    *   The class of the types serialized with the given serializer.
    * @param serializerClass
    *   The class of the serializer to use.
    */
  def addDefaultKryoSerializer(`type`: Class[_], serializerClass: Class[_ <: Serializer[_]]): Unit = {
    javaEnv.addDefaultKryoSerializer(`type`, serializerClass)
  }

  /** Registers the given type with the serializer at the [[KryoSerializer]].
    *
    * Note that the serializer instance must be serializable (as defined by java.io.Serializable), because it may be
    * distributed to the worker nodes by Java serialization.
    */
  def registerTypeWithKryoSerializer[T <: Serializer[_] with Serializable](clazz: Class[_], serializer: T): Unit = {
    javaEnv.registerTypeWithKryoSerializer(clazz, serializer)
  }

  /** Registers the given type with the serializer at the [[KryoSerializer]]. */
  def registerTypeWithKryoSerializer(clazz: Class[_], serializer: Class[_ <: Serializer[_]]): Unit = {
    javaEnv.registerTypeWithKryoSerializer(clazz, serializer)
  }

  /** Registers the given type with the serialization stack. If the type is eventually serialized as a POJO, then the
    * type is registered with the POJO serializer. If the type ends up being serialized with Kryo, then it will be
    * registered at Kryo to make sure that only tags are written.
    */
  def registerType(typeClass: Class[_]): Unit = {
    javaEnv.registerType(typeClass)
  }
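
  // Usage sketch (illustrative only): `MyEvent` and `MyEventKryoSerializer` are hypothetical
  // application classes, shown to illustrate the registration calls above.
  //
  //   env.registerType(classOf[MyEvent])
  //   env.addDefaultKryoSerializer(classOf[MyEvent], classOf[MyEventKryoSerializer])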

  /** Sets all relevant options contained in the [[ReadableConfig]], such as
    * [[org.apache.flink.streaming.api.environment.StreamPipelineOptions#TIME_CHARACTERISTIC]]. It will reconfigure
    * [[StreamExecutionEnvironment]], [[org.apache.flink.api.common.ExecutionConfig]] and
    * [[org.apache.flink.streaming.api.environment.CheckpointConfig]].
    *
    * It will change the value of a setting only if a corresponding option was set in the `configuration`. If a key is
    * not present, the current value of a field will remain untouched.
    *
    * @param configuration
    *   a configuration to read the values from
    * @param classLoader
    *   a class loader to use when loading classes
    */
  @PublicEvolving
  def configure(configuration: ReadableConfig, classLoader: ClassLoader): Unit = {
    javaEnv.configure(configuration, classLoader)
  }

  /** Sets all relevant options contained in the [[ReadableConfig]], such as
    * [[org.apache.flink.streaming.api.environment.StreamPipelineOptions#TIME_CHARACTERISTIC]]. It will reconfigure
    * [[StreamExecutionEnvironment]], [[org.apache.flink.api.common.ExecutionConfig]] and
    * [[org.apache.flink.streaming.api.environment.CheckpointConfig]].
    *
    * It will change the value of a setting only if a corresponding option was set in the `configuration`. If a key is
    * not present, the current value of a field will remain untouched.
    *
    * @param configuration
    *   a configuration to read the values from
    */
  @PublicEvolving
  def configure(configuration: ReadableConfig): Unit = {
    javaEnv.configure(configuration)
  }

  // --------------------------------------------------------------------------------------------
  //  Data stream creations
  // --------------------------------------------------------------------------------------------

  /** Creates a new data stream that contains a sequence of numbers (longs) and is useful for testing and for cases
    * that just need a stream of N events of any kind.
    *
    * The generated source splits the sequence into as many parallel sub-sequences as there are parallel source
    * readers. Each sub-sequence will be produced in order. If the parallelism is limited to one, the source will
    * produce one sequence in order.
    *
    * This source is always bounded. For very long sequences (for example over the entire domain of long integer
    * values), you may consider executing the application in a streaming manner because of the end bound that is
    * pretty far away.
    *
    * Use [[fromSource(Source, WatermarkStrategy, String)]] together with [[NumberSequenceSource]] if you require more
    * control over the created sources. For example, if you want to set a [[WatermarkStrategy]].
    */
  def fromSequence(from: Long, to: Long): DataStream[Long] = {
    new DataStream[java.lang.Long](javaEnv.fromSequence(from, to))
      .asInstanceOf[DataStream[Long]]
  }

  /** Creates a DataStream that contains the given elements. The elements must all be of the same type.
    *
    * Note that this operation will result in a non-parallel data source, i.e. a data source with a parallelism of
    * one.
    */
  def fromElements[T: TypeInformation](data: T*): DataStream[T] = {
    fromCollection(data)
  }

  /** Creates a DataStream from the given non-empty [[Seq]]. The elements need to be serializable because the
    * framework may move the elements into the cluster if needed.
    *
    * Note that this operation will result in a non-parallel data source, i.e. a data source with a parallelism of
    * one.
    */
  def fromCollection[T: TypeInformation](data: Seq[T]): DataStream[T] = {
    require(data != null, "Data must not be null.")

    val typeInfo   = implicitly[TypeInformation[T]]
    val collection = data.asJavaCollection
    asScalaStream(javaEnv.fromCollection(collection, typeInfo))
  }

  /** Creates a DataStream from the given [[Iterator]].
    *
    * Note that this operation will result in a non-parallel data source, i.e. a data source with a parallelism of
    * one.
    */
  def fromCollection[T: TypeInformation](data: Iterator[T]): DataStream[T] = {
    val typeInfo = implicitly[TypeInformation[T]]
    asScalaStream(javaEnv.fromCollection(data.asJava, typeInfo))
  }

  /** Creates a DataStream from the given [[SplittableIterator]]. */
  def fromParallelCollection[T: TypeInformation](data: SplittableIterator[T]): DataStream[T] = {
    val typeInfo = implicitly[TypeInformation[T]]
    asScalaStream(javaEnv.fromParallelCollection(data, typeInfo))
  }
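
  // Usage sketch (illustrative only): small bounded streams, handy in tests. An implicit
  // TypeInformation for the element types is assumed to be in scope (e.g. via the implicit
  // serializers provided by this library).
  //
  //   val words: DataStream[String] = env.fromElements("flink", "scala", "api")
  //   val longs: DataStream[Long]   = env.fromSequence(1L, 1000L)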

  /** Creates a DataStream that represents the Strings produced by reading the given file line wise. The file will be
    * read with the system's default character set.
    */
  def readTextFile(filePath: String): DataStream[String] =
    asScalaStream(javaEnv.readTextFile(filePath))

  /** Creates a data stream that represents the Strings produced by reading the given file line wise. The character
    * set with the given name will be used to read the files.
    */
  def readTextFile(filePath: String, charsetName: String): DataStream[String] =
    asScalaStream(javaEnv.readTextFile(filePath, charsetName))

  /** Reads the given file with the given input format. The file path should be passed as a URI (e.g.,
    * "file:///some/local/file" or "hdfs://host:port/file/path").
    */
  def readFile[T: TypeInformation](inputFormat: FileInputFormat[T], filePath: String): DataStream[T] =
    asScalaStream(javaEnv.readFile(inputFormat, filePath))

  /** Reads the contents of the user-specified path based on the given [[FileInputFormat]]. Depending on the provided
    * [[FileProcessingMode]], the source may periodically monitor (every `interval` ms) the path for new data
    * ([[FileProcessingMode.PROCESS_CONTINUOUSLY]]), or process once the data currently in the path and exit
    * ([[FileProcessingMode.PROCESS_ONCE]]). In addition, if the path contains files not to be processed, the user can
    * specify a custom [[FilePathFilter]]. As a default implementation you can use
    * [[FilePathFilter.createDefaultFilter()]].
    *
    * ** NOTES ON CHECKPOINTING: ** If the `watchType` is set to [[FileProcessingMode#PROCESS_ONCE]], the source
    * monitors the path ** once **, creates the [[org.apache.flink.core.fs.FileInputSplit FileInputSplits]] to be
    * processed, forwards them to the downstream [[ContinuousFileReaderOperator readers]] to read the actual data, and
    * exits, without waiting for the readers to finish reading. This implies that no more checkpoint barriers are
    * going to be forwarded after the source exits, thus having no checkpoints after that point.
    *
    * @param inputFormat
    *   The input format used to create the data stream
    * @param filePath
    *   The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
    * @param watchType
    *   The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
    * @param interval
    *   In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path
    *   scans
    * @return
    *   The data stream that represents the data read from the given file
    */
  @PublicEvolving
  def readFile[T: TypeInformation](
      inputFormat: FileInputFormat[T],
      filePath: String,
      watchType: FileProcessingMode,
      interval: Long
  ): DataStream[T] = {
    val typeInfo = implicitly[TypeInformation[T]]
    asScalaStream(javaEnv.readFile(inputFormat, filePath, watchType, interval, typeInfo))
  }
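
  // Usage sketch (illustrative only): continuously monitoring a directory with a
  // TextInputFormat (org.apache.flink.api.java.io) and rescanning every 60 seconds. The path
  // and interval are arbitrary; an implicit TypeInformation[String] is assumed to be in scope.
  //
  //   import org.apache.flink.api.java.io.TextInputFormat
  //   val format = new TextInputFormat(new Path("file:///tmp/input"))
  //   val lines  = env.readFile(format, "file:///tmp/input", FileProcessingMode.PROCESS_CONTINUOUSLY, 60000L)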

  /** Creates a new DataStream that contains the strings received infinitely from socket. Received strings are
    * decoded by the system's default character set. The maximum retry interval is specified in seconds; in case of
    * temporary service outage reconnection is initiated every second.
    */
  @PublicEvolving
  def socketTextStream(hostname: String, port: Int, delimiter: Char = '\n', maxRetry: Long = 0): DataStream[String] =
    asScalaStream(javaEnv.socketTextStream(hostname, port, delimiter.toString, maxRetry))

  /** Generic method to create an input data stream with a specific input format. Since all data streams need
    * specific information about their types, this method needs to determine the type of the data produced by the
    * input format. It will attempt to determine the data type by reflection, unless the input format implements the
    * ResultTypeQueryable interface.
    */
  @PublicEvolving
  def createInput[T: TypeInformation](inputFormat: InputFormat[T, _]): DataStream[T] =
    if (inputFormat.isInstanceOf[ResultTypeQueryable[_]]) {
      asScalaStream(javaEnv.createInput(inputFormat))
    } else {
      asScalaStream(javaEnv.createInput(inputFormat, implicitly[TypeInformation[T]]))
    }

  /** Create a DataStream using a user defined source function for arbitrary source functionality. By default sources
    * have a parallelism of 1. To enable parallel execution, the user defined source should implement
    * ParallelSourceFunction or extend RichParallelSourceFunction. In these cases the resulting source will have the
    * parallelism of the environment. To change this afterwards call DataStreamSource.setParallelism(int)
    */
  def addSource[T: TypeInformation](function: SourceFunction[T]): DataStream[T] = {
    require(function != null, "Function must not be null.")

    val cleanFun = scalaClean(function)
    val typeInfo = implicitly[TypeInformation[T]]
    asScalaStream(javaEnv.addSource(cleanFun, typeInfo))
  }

  /** Create a DataStream using a user defined source function for arbitrary source functionality. */
  def addSource[T: TypeInformation](function: SourceContext[T] => Unit): DataStream[T] = {
    require(function != null, "Function must not be null.")
    val sourceFunction = new SourceFunction[T] {
      val cleanFun = scalaClean(function)

      override def run(ctx: SourceContext[T]): Unit = {
        cleanFun(ctx)
      }

      override def cancel(): Unit = {}
    }
    addSource(sourceFunction)
  }

  /** Create a DataStream using a [[Source]]. */
  @Experimental
  def fromSource[T: TypeInformation](
      source: Source[T, _ <: SourceSplit, _],
      watermarkStrategy: WatermarkStrategy[T],
      sourceName: String
  ): DataStream[T] = {
    val typeInfo = implicitly[TypeInformation[T]]
    asScalaStream(javaEnv.fromSource(source, watermarkStrategy, sourceName, typeInfo))
  }
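
  // Usage sketch (illustrative only): attaching the bounded NumberSequenceSource imported
  // above through the unified Source API. An implicit TypeInformation for java.lang.Long is
  // assumed to be in scope; the source name "sequence" is arbitrary.
  //
  //   val stream = env.fromSource(
  //     new NumberSequenceSource(1L, 1000L),
  //     WatermarkStrategy.noWatermarks[java.lang.Long](),
  //     "sequence"
  //   )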

  /** Triggers the program execution. The environment will execute all parts of the program that have resulted in a
    * "sink" operation. Sink operations are for example printing results or forwarding them to a message queue.
    *
    * The program execution will be logged and displayed with a generated default name.
    *
    * @return
    *   The result of the job execution, containing elapsed time and accumulators.
    */
  def execute(): JobExecutionResult = javaEnv.execute()

  /** Triggers the program execution. The environment will execute all parts of the program that have resulted in a
    * "sink" operation. Sink operations are for example printing results or forwarding them to a message queue.
    *
    * The program execution will be logged and displayed with the provided name.
    *
    * @return
    *   The result of the job execution, containing elapsed time and accumulators.
    */
  def execute(jobName: String): JobExecutionResult = javaEnv.execute(jobName)

  /** Register a [[JobListener]] in this environment. The [[JobListener]] will be notified on specific job status
    * changes.
    */
  @PublicEvolving
  def registerJobListener(jobListener: JobListener): Unit = {
    javaEnv.registerJobListener(jobListener)
  }

  /** Clear all registered [[JobListener]]s. */
  @PublicEvolving
  def clearJobListeners(): Unit = {
    javaEnv.clearJobListeners()
  }

  /** Triggers the program execution asynchronously. The environment will execute all parts of the program that have
    * resulted in a "sink" operation. Sink operations are for example printing results or forwarding them to a
    * message queue.
    *
    * The program execution will be logged and displayed with a generated default name.
    *
    * ATTENTION: The caller of this method is responsible for managing the lifecycle of the returned [[JobClient]].
    * This means calling [[JobClient#close()]] at the end of its usage. Otherwise, there may be resource leaks
    * depending on the JobClient implementation.
    *
    * @return
    *   A [[JobClient]] that can be used to communicate with the submitted job, completed when the submission
    *   succeeds.
    */
  @PublicEvolving
  def executeAsync(): JobClient = javaEnv.executeAsync()

  /** Triggers the program execution asynchronously. The environment will execute all parts of the program that have
    * resulted in a "sink" operation. Sink operations are for example printing results or forwarding them to a
    * message queue.
    *
    * The program execution will be logged and displayed with the provided name.
    *
    * ATTENTION: The caller of this method is responsible for managing the lifecycle of the returned [[JobClient]].
    * This means calling [[JobClient#close()]] at the end of its usage. Otherwise, there may be resource leaks
    * depending on the JobClient implementation.
    *
    * @return
    *   A [[JobClient]] that can be used to communicate with the submitted job, completed when the submission
    *   succeeds.
    */
  @PublicEvolving
  def executeAsync(jobName: String): JobClient = javaEnv.executeAsync(jobName)

  /** Creates the plan with which the system will execute the program, and returns it as a String using a JSON
    * representation of the execution data flow graph. Note that this needs to be called before the plan is executed.
    */
  def getExecutionPlan: String = javaEnv.getExecutionPlan

  /** Getter of the [[org.apache.flink.streaming.api.graph.StreamGraph]] of the streaming job. This call clears
    * previously registered [[org.apache.flink.api.dag.Transformation transformations]].
    *
    * @return
    *   The StreamGraph representing the transformations
    */
  @Internal
  def getStreamGraph: StreamGraph = javaEnv.getStreamGraph

  /** Getter of the [[org.apache.flink.streaming.api.graph.StreamGraph]] of the streaming job with the option to
    * clear previously registered [[org.apache.flink.api.dag.Transformation transformations]]. Clearing the
    * transformations allows, for example, to not re-execute the same operations when calling [[execute()]] multiple
    * times.
    *
    * @param clearTransformations
    *   Whether or not to clear previously registered transformations
    * @return
    *   The StreamGraph representing the transformations
    */
  @Internal
  def getStreamGraph(clearTransformations: Boolean): StreamGraph = {
    javaEnv.getStreamGraph(clearTransformations)
  }

  /** Gives read-only access to the underlying configuration of this environment.
    *
    * Note that the returned configuration might not be complete. It only contains options that have initialized the
    * environment or options that are not represented in dedicated configuration classes such as [[ExecutionConfig]]
    * or [[CheckpointConfig]].
    *
    * Use [[configure]] to set options that are specific to this environment.
    */
  @Internal
  def getConfiguration: ReadableConfig = javaEnv.getConfiguration

  /** Getter of the wrapped [[org.apache.flink.streaming.api.environment.StreamExecutionEnvironment]]
    *
    * @return
    *   The encased ExecutionEnvironment
    */
  @Internal
  def getWrappedStreamExecutionEnvironment: JavaEnv = javaEnv
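
  // Usage sketch (illustrative only): submitting the job asynchronously and closing the
  // returned JobClient when done, as the ATTENTION notes above require.
  //
  //   val jobClient = env.executeAsync("my-job")
  //   try {
  //     val jobId = jobClient.getJobID
  //     // ... interact with the running job via jobClient ...
  //   } finally jobClient.close()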

  /** Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning is not disabled in
    * the [[org.apache.flink.api.common.ExecutionConfig]].
    */
  private[flinkx] def scalaClean[F <: AnyRef](f: F): F = {
    if (getConfig.isClosureCleanerEnabled) {
      ClosureCleaner.scalaClean(
        f,
        checkSerializable = true,
        cleanTransitively = getConfig.getClosureCleanerLevel == ClosureCleanerLevel.RECURSIVE
      )
    } else {
      ClosureCleaner.ensureSerializable(f)
    }
    f
  }

  /** Registers a file at the distributed cache under the given name. The file will be accessible from any
    * user-defined function in the (distributed) runtime under a local path. Files may be local files (which will be
    * distributed via BlobServer), or files in a distributed file system. The runtime will copy the files temporarily
    * to a local cache, if needed.
    *
    * The [[org.apache.flink.api.common.functions.RuntimeContext]] can be obtained inside UDFs via
    * [[org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()]] and provides access to
    * [[org.apache.flink.api.common.cache.DistributedCache]] via
    * [[org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()]].
    *
    * @param filePath
    *   The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path")
    * @param name
    *   The name under which the file is registered.
    */
  def registerCachedFile(filePath: String, name: String): Unit = {
    javaEnv.registerCachedFile(filePath, name)
  }

  /** Registers a file at the distributed cache under the given name. The file will be accessible from any
    * user-defined function in the (distributed) runtime under a local path. Files may be local files (which will be
    * distributed via BlobServer), or files in a distributed file system. The runtime will copy the files temporarily
    * to a local cache, if needed.
    *
    * The [[org.apache.flink.api.common.functions.RuntimeContext]] can be obtained inside UDFs via
    * [[org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()]] and provides access to
    * [[org.apache.flink.api.common.cache.DistributedCache]] via
    * [[org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()]].
    *
    * @param filePath
    *   The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path")
    * @param name
    *   The name under which the file is registered.
    * @param executable
    *   flag indicating whether the file should be executable
    */
  def registerCachedFile(filePath: String, name: String, executable: Boolean): Unit = {
    javaEnv.registerCachedFile(filePath, name, executable)
  }
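
  // Usage sketch (illustrative only) for the distributed cache: the path and name are
  // arbitrary, and the lookup happens later inside a RichFunction at runtime.
  //
  //   env.registerCachedFile("hdfs://namenode:9000/config/lookup.csv", "lookup")
  //   // inside a RichFunction:
  //   //   val file = getRuntimeContext.getDistributedCache.getFile("lookup")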

  /** Returns whether Unaligned Checkpoints are enabled. */
  def isUnalignedCheckpointsEnabled: Boolean = javaEnv.isUnalignedCheckpointsEnabled

  /** Returns whether Unaligned Checkpoints are force-enabled. */
  def isForceUnalignedCheckpoints: Boolean = javaEnv.isForceUnalignedCheckpoints
}

object StreamExecutionEnvironment {

  /** Sets the default parallelism that will be used for the local execution environment created by
    * [[createLocalEnvironment()]].
    *
    * @param parallelism
    *   The default parallelism to use for local execution.
    */
  @PublicEvolving
  def setDefaultLocalParallelism(parallelism: Int): Unit =
    JavaEnv.setDefaultLocalParallelism(parallelism)

  /** Gets the default parallelism that will be used for the local execution environment created by
    * [[createLocalEnvironment()]].
    */
  @PublicEvolving
  def getDefaultLocalParallelism: Int = JavaEnv.getDefaultLocalParallelism

  // --------------------------------------------------------------------------
  //  context environment
  // --------------------------------------------------------------------------

  /** Creates an execution environment that represents the context in which the program is currently executed. If the
    * program is invoked standalone, this method returns a local execution environment. If the program is invoked
    * from within the command line client to be submitted to a cluster, this method returns the execution environment
    * of this cluster.
    */
  def getExecutionEnvironment: StreamExecutionEnvironment = {
    new StreamExecutionEnvironment(JavaEnv.getExecutionEnvironment)
  }

  // --------------------------------------------------------------------------
  //  local environment
  // --------------------------------------------------------------------------

  /** Creates a local execution environment. The local execution environment will run the program in a multi-threaded
    * fashion in the same JVM as the environment was created in.
    *
    * This method sets the environment's default parallelism to the given parameter, which defaults to the value set
    * via [[setDefaultLocalParallelism(Int)]].
    */
  def createLocalEnvironment(parallelism: Int = JavaEnv.getDefaultLocalParallelism): StreamExecutionEnvironment = {
    new StreamExecutionEnvironment(JavaEnv.createLocalEnvironment(parallelism))
  }

  /** Creates a local execution environment. The local execution environment will run the program in a multi-threaded
    * fashion in the same JVM as the environment was created in.
    *
    * @param parallelism
    *   The parallelism for the local environment.
    * @param configuration
    *   Pass a custom configuration into the cluster.
    */
  def createLocalEnvironment(parallelism: Int, configuration: Configuration): StreamExecutionEnvironment = {
    new StreamExecutionEnvironment(JavaEnv.createLocalEnvironment(parallelism, configuration))
  }

  /** Creates a [[StreamExecutionEnvironment]] for local program execution that also starts the web monitoring UI.
    *
    * The local execution environment will run the program in a multi-threaded fashion in the same JVM as the
    * environment was created in. It will use the parallelism specified in the parameter.
    *
    * If the configuration key 'rest.port' was set in the configuration, that particular port will be used for the
    * web UI. Otherwise, the default port (8081) will be used.
    *
    * @param config
    *   optional config for the local execution
    * @return
    *   The created StreamExecutionEnvironment
    */
  @PublicEvolving
  def createLocalEnvironmentWithWebUI(config: Configuration = null): StreamExecutionEnvironment = {
    val conf: Configuration = if (config == null) new Configuration() else config
    new StreamExecutionEnvironment(JavaEnv.createLocalEnvironmentWithWebUI(conf))
  }
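
  // Usage sketch (illustrative only): a local environment with the web UI on a custom port.
  // RestOptions comes from org.apache.flink.configuration; the port value is arbitrary.
  //
  //   import org.apache.flink.configuration.RestOptions
  //   val conf = new Configuration()
  //   conf.set(RestOptions.PORT, Int.box(8082))
  //   val env  = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf)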

  // --------------------------------------------------------------------------
  //  remote environment
  // --------------------------------------------------------------------------

  /** Creates a remote execution environment. The remote environment sends (parts of) the program to a cluster for
    * execution. Note that all file paths used in the program must be accessible from the cluster. The execution will
    * use the cluster's default parallelism, unless the parallelism is set explicitly via
    * [[StreamExecutionEnvironment.setParallelism()]].
    *
    * @param host
    *   The host name or address of the master (JobManager), where the program should be executed.
    * @param port
    *   The port of the master (JobManager), where the program should be executed.
    * @param jarFiles
    *   The JAR files with code that needs to be shipped to the cluster. If the program uses user-defined functions,
    *   user-defined input formats, or any libraries, those must be provided in the JAR files.
    */
  def createRemoteEnvironment(host: String, port: Int, jarFiles: String*): StreamExecutionEnvironment = {
    new StreamExecutionEnvironment(JavaEnv.createRemoteEnvironment(host, port, jarFiles: _*))
  }

  /** Creates a remote execution environment. The remote environment sends (parts of) the program to a cluster for
    * execution. Note that all file paths used in the program must be accessible from the cluster. The execution will
    * use the specified parallelism.
    *
    * @param host
    *   The host name or address of the master (JobManager), where the program should be executed.
    * @param port
    *   The port of the master (JobManager), where the program should be executed.
    * @param parallelism
    *   The parallelism to use during the execution.
    * @param jarFiles
    *   The JAR files with code that needs to be shipped to the cluster. If the program uses user-defined functions,
    *   user-defined input formats, or any libraries, those must be provided in the JAR files.
    */
  def createRemoteEnvironment(
      host: String,
      port: Int,
      parallelism: Int,
      jarFiles: String*
  ): StreamExecutionEnvironment = {
    val javaEnv = JavaEnv.createRemoteEnvironment(host, port, jarFiles: _*)
    javaEnv.setParallelism(parallelism)
    new StreamExecutionEnvironment(javaEnv)
  }
}
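
// End-to-end usage sketch (illustrative only): it assumes implicit TypeInformation instances
// for String and Int are in scope, e.g. via the implicit serializers provided by this library.
//
//   object WordLengths {
//     def main(args: Array[String]): Unit = {
//       val env = StreamExecutionEnvironment.getExecutionEnvironment
//       env.setParallelism(2)
//       env.fromElements("flink", "scala", "api").map(_.length).print()
//       env.execute("word-lengths")
//     }
//   }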




