org.apache.flink.streaming.api.scala.StreamExecutionEnvironment.scala Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.scala
import com.esotericsoftware.kryo.Serializer
import org.apache.flink.annotation.{Internal, Public, PublicEvolving}
import org.apache.flink.api.common.{JobID, JobSubmissionResult}
import org.apache.flink.api.common.io.{FileInputFormat, FilePathFilter, InputFormat}
import org.apache.flink.api.common.operators.ResourceSpec
import org.apache.flink.api.common.restartstrategy.RestartStrategies.RestartStrategyConfiguration
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.JobListener
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer
import org.apache.flink.api.scala.ClosureCleaner
import org.apache.flink.configuration.Configuration
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings
import org.apache.flink.runtime.state.AbstractStateBackend
import org.apache.flink.runtime.state.StateBackend
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.JobType
import org.apache.flink.streaming.api.environment.{StreamExecutionEnvironment => JavaEnv}
import org.apache.flink.streaming.api.functions.source._
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
import org.apache.flink.streaming.api.graph.StreamGraph
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.util.SplittableIterator
import scala.collection.JavaConverters._
import _root_.scala.language.implicitConversions
@Public
class StreamExecutionEnvironment(javaEnv: JavaEnv) {
/**
* @return the wrapped Java environment
*/
def getJavaEnv: JavaEnv = javaEnv
/**
* Gets the config object.
*/
def getConfig = javaEnv.getConfig
/**
* Gets cache files.
*/
def getCachedFiles = javaEnv.getCachedFiles
/**
* Sets the parallelism for operations executed through this environment.
* Setting a parallelism of x here will cause all operators (such as join, map, reduce) to run
* with x parallel instances. This value can be overridden by specific operations using
* [[DataStream#setParallelism(int)]].
*/
def setParallelism(parallelism: Int): Unit = {
javaEnv.setParallelism(parallelism)
}
/**
* Sets the maximum degree of parallelism defined for the program.
* The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also
* defines the number of key groups used for partitioned state.
**/
def setMaxParallelism(maxParallelism: Int): Unit = {
javaEnv.setMaxParallelism(maxParallelism)
}
/**
* Returns the default parallelism for this execution environment. Note that this
* value can be overridden by individual operations using [[DataStream#setParallelism(int)]]
*/
def getParallelism = javaEnv.getParallelism
/**
* Returns the maximum degree of parallelism defined for the program.
*
* The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also
* defines the number of key groups used for partitioned state.
*
*/
def getMaxParallelism = javaEnv.getMaxParallelism
/**
* Sets the maximum time frequency (milliseconds) for the flushing of the
* output buffers. By default the output buffers flush frequently to provide
* low latency and to aid smooth developer experience. Setting the parameter
* can result in three logical modes:
*
*
* - A positive integer triggers flushing periodically by that integer
* - 0 triggers flushing after every record thus minimizing latency
* - -1 triggers flushing only when the output buffer is full thus maximizing throughput
*
*/
def setBufferTimeout(timeoutMillis: Long): StreamExecutionEnvironment = {
javaEnv.setBufferTimeout(timeoutMillis)
this
}
/**
* Gets the default buffer timeout set for this environment
*/
def getBufferTimeout = javaEnv.getBufferTimeout
def getJobListeners: java.util.List[JobListener] = javaEnv.getJobListeners
def addJobListener(jobListener: JobListener) = {
javaEnv.addJobListener(jobListener)
}
/**
* Disables operator chaining for streaming operators. Operator chaining
* allows non-shuffle operations to be co-located in the same thread fully
* avoiding serialization and de-serialization.
*
*/
@PublicEvolving
def disableOperatorChaining(): StreamExecutionEnvironment = {
javaEnv.disableOperatorChaining()
this
}
def setMultiHeadChainMode(multiHeadChainMode: Boolean): StreamExecutionEnvironment = {
javaEnv.setMultiHeadChainMode(multiHeadChainMode)
this
}
def isMultiHeadChainMode = javaEnv.isMultiHeadChainMode;
def disableCheckpointing(): StreamExecutionEnvironment = {
javaEnv.disableCheckpointing()
this
}
def enableSlotSharing(): StreamExecutionEnvironment = {
javaEnv.enableSlotSharing()
this
}
def disableSlotSharing(): StreamExecutionEnvironment = {
javaEnv.disableSlotSharing()
this
}
def isSlotSharingEnabled = javaEnv.isSlotSharingEnabled
def getDefaultResources: ResourceSpec = javaEnv.getDefaultResources;
def setDefaultResources(resources: ResourceSpec): StreamExecutionEnvironment = {
javaEnv.setDefaultResources(resources)
this
}
def setJobType(jobType: JobType): StreamExecutionEnvironment = {
javaEnv.setJobType(jobType)
this
}
def clearTransformations: StreamExecutionEnvironment = {
javaEnv.clearTransformations();
this
}
// ------------------------------------------------------------------------
// Checkpointing Settings
// ------------------------------------------------------------------------
/**
* Gets the checkpoint config, which defines values like checkpoint interval, delay between
* checkpoints, etc.
*/
def getCheckpointConfig = javaEnv.getCheckpointConfig()
/**
* Enables checkpointing for the streaming job. The distributed state of the streaming
* dataflow will be periodically snapshotted. In case of a failure, the streaming
* dataflow will be restarted from the latest completed checkpoint.
*
* The job draws checkpoints periodically, in the given interval. The state will be
* stored in the configured state backend.
*
* NOTE: Checkpointing iterative streaming dataflows in not properly supported at
* the moment. If the "force" parameter is set to true, the system will execute the
* job nonetheless.
*
* @param interval
* Time interval between state checkpoints in millis.
* @param mode
* The checkpointing mode, selecting between "exactly once" and "at least once" guarantees.
* @param force
* If true checkpointing will be enabled for iterative jobs as well.
*/
@deprecated
@PublicEvolving
def enableCheckpointing(interval : Long,
mode: CheckpointingMode,
force: Boolean) : StreamExecutionEnvironment = {
javaEnv.enableCheckpointing(interval, mode, force)
this
}
/**
* Enables checkpointing for the streaming job. The distributed state of the streaming
* dataflow will be periodically snapshotted. In case of a failure, the streaming
* dataflow will be restarted from the latest completed checkpoint.
*
* The job draws checkpoints periodically, in the given interval. The system uses the
* given [[CheckpointingMode]] for the checkpointing ("exactly once" vs "at least once").
* The state will be stored in the configured state backend.
*
* NOTE: Checkpointing iterative streaming dataflows in not properly supported at
* the moment. For that reason, iterative jobs will not be started if used
* with enabled checkpointing. To override this mechanism, use the
* [[enableCheckpointing(long, CheckpointingMode, boolean)]] method.
*
* @param interval
* Time interval between state checkpoints in milliseconds.
* @param mode
* The checkpointing mode, selecting between "exactly once" and "at least once" guarantees.
*/
def enableCheckpointing(interval : Long,
mode: CheckpointingMode) : StreamExecutionEnvironment = {
javaEnv.enableCheckpointing(interval, mode)
this
}
/**
* Enables checkpointing for the streaming job. The distributed state of the streaming
* dataflow will be periodically snapshotted. In case of a failure, the streaming
* dataflow will be restarted from the latest completed checkpoint.
*
* The job draws checkpoints periodically, in the given interval. The program will use
* [[CheckpointingMode.EXACTLY_ONCE]] mode. The state will be stored in the
* configured state backend.
*
* NOTE: Checkpointing iterative streaming dataflows in not properly supported at
* the moment. For that reason, iterative jobs will not be started if used
* with enabled checkpointing. To override this mechanism, use the
* [[enableCheckpointing(long, CheckpointingMode, boolean)]] method.
*
* @param interval
* Time interval between state checkpoints in milliseconds.
*/
def enableCheckpointing(interval : Long) : StreamExecutionEnvironment = {
enableCheckpointing(interval, CheckpointingMode.EXACTLY_ONCE)
}
/**
* Method for enabling fault-tolerance. Activates monitoring and backup of streaming
* operator states. Time interval between state checkpoints is specified in in millis.
*
* Setting this option assumes that the job is used in production and thus if not stated
* explicitly otherwise with calling the [[setRestartStrategy]] method in case of
* failure the job will be resubmitted to the cluster indefinitely.
*/
@deprecated
@PublicEvolving
def enableCheckpointing() : StreamExecutionEnvironment = {
javaEnv.enableCheckpointing()
this
}
def getCheckpointingMode = javaEnv.getCheckpointingMode()
/**
* Sets the state backend that describes how to store and checkpoint operator state. It defines
* both which data structures hold state during execution (for example hash tables, RockDB,
* or other data stores) as well as where checkpointed data will be persisted.
*
* State managed by the state backend includes both keyed state that is accessible on
* [[org.apache.flink.streaming.api.datastream.KeyedStream keyed streams]], as well as
* state maintained directly by the user code that implements
* [[org.apache.flink.streaming.api.checkpoint.CheckpointedFunction CheckpointedFunction]].
*
* The [[org.apache.flink.runtime.state.memory.MemoryStateBackend]], for example,
* maintains the state in heap memory, as objects. It is lightweight without extra dependencies,
* but can checkpoint only small states (some counters).
*
* In contrast, the [[org.apache.flink.runtime.state.filesystem.FsStateBackend]]
* stores checkpoints of the state (also maintained as heap objects) in files.
* When using a replicated file system (like HDFS, S3, MapR FS, Tachyon, etc) this will guarantee
* that state is not lost upon failures of individual nodes and that streaming program can be
* executed highly available and strongly consistent.
*/
@PublicEvolving
def setStateBackend(backend: StateBackend): StreamExecutionEnvironment = {
javaEnv.setStateBackend(backend)
this
}
/**
* @deprecated Use [[StreamExecutionEnvironment.setStateBackend(StateBackend)]] instead.
*/
@Deprecated
@PublicEvolving
def setStateBackend(backend: AbstractStateBackend): StreamExecutionEnvironment = {
setStateBackend(backend.asInstanceOf[StateBackend])
}
/**
* Returns the state backend that defines how to store and checkpoint state.
*/
@PublicEvolving
def getStateBackend: StateBackend = javaEnv.getStateBackend()
/**
* Sets the restart strategy configuration. The configuration specifies which restart strategy
* will be used for the execution graph in case of a restart.
*
* @param restartStrategyConfiguration Restart strategy configuration to be set
*/
@PublicEvolving
def setRestartStrategy(restartStrategyConfiguration: RestartStrategyConfiguration): Unit = {
javaEnv.setRestartStrategy(restartStrategyConfiguration)
}
/**
* Returns the specified restart strategy configuration.
*
* @return The restart strategy configuration to be used
*/
@PublicEvolving
def getRestartStrategy: RestartStrategyConfiguration = {
javaEnv.getRestartStrategy()
}
/**
* Sets the number of times that failed tasks are re-executed. A value of zero
* effectively disables fault tolerance. A value of "-1" indicates that the system
* default value (as defined in the configuration) should be used.
*
* @deprecated This method will be replaced by [[setRestartStrategy()]]. The
* FixedDelayRestartStrategyConfiguration contains the number of execution retries.
*/
@PublicEvolving
def setNumberOfExecutionRetries(numRetries: Int): Unit = {
javaEnv.setNumberOfExecutionRetries(numRetries)
}
/**
* Gets the number of times the system will try to re-execute failed tasks. A value
* of "-1" indicates that the system default value (as defined in the configuration)
* should be used.
*
* @deprecated This method will be replaced by [[getRestartStrategy]]. The
* FixedDelayRestartStrategyConfiguration contains the number of execution retries.
*/
@PublicEvolving
def getNumberOfExecutionRetries = javaEnv.getNumberOfExecutionRetries
// --------------------------------------------------------------------------------------------
// Registry for types and serializers
// --------------------------------------------------------------------------------------------
/**
* Adds a new Kryo default serializer to the Runtime.
*
* Note that the serializer instance must be serializable (as defined by
* java.io.Serializable), because it may be distributed to the worker nodes
* by java serialization.
*
* @param type
* The class of the types serialized with the given serializer.
* @param serializer
* The serializer to use.
*/
def addDefaultKryoSerializer[T <: Serializer[_] with Serializable](
`type`: Class[_],
serializer: T)
: Unit = {
javaEnv.addDefaultKryoSerializer(`type`, serializer)
}
/**
* Adds a new Kryo default serializer to the Runtime.
*
* @param type
* The class of the types serialized with the given serializer.
* @param serializerClass
* The class of the serializer to use.
*/
def addDefaultKryoSerializer(`type`: Class[_], serializerClass: Class[_ <: Serializer[_]]) {
javaEnv.addDefaultKryoSerializer(`type`, serializerClass)
}
/**
* Registers the given type with the serializer at the [[KryoSerializer]].
*
* Note that the serializer instance must be serializable (as defined by java.io.Serializable),
* because it may be distributed to the worker nodes by java serialization.
*/
def registerTypeWithKryoSerializer[T <: Serializer[_] with Serializable](
clazz: Class[_],
serializer: T)
: Unit = {
javaEnv.registerTypeWithKryoSerializer(clazz, serializer)
}
/**
* Registers the given type with the serializer at the [[KryoSerializer]].
*/
def registerTypeWithKryoSerializer(clazz: Class[_], serializer: Class[_ <: Serializer[_]]) {
javaEnv.registerTypeWithKryoSerializer(clazz, serializer)
}
/**
* Registers the given type with the serialization stack. If the type is eventually
* serialized as a POJO, then the type is registered with the POJO serializer. If the
* type ends up being serialized with Kryo, then it will be registered at Kryo to make
* sure that only tags are written.
*
*/
def registerType(typeClass: Class[_]) {
javaEnv.registerType(typeClass)
}
// --------------------------------------------------------------------------------------------
// Time characteristic
// --------------------------------------------------------------------------------------------
/**
* Sets the time characteristic for all streams create from this environment, e.g., processing
* time, event time, or ingestion time.
*
* If you set the characteristic to IngestionTime of EventTime this will set a default
* watermark update interval of 200 ms. If this is not applicable for your application
* you should change it using
* [[org.apache.flink.api.common.ExecutionConfig#setAutoWatermarkInterval(long)]]
*
* @param characteristic The time characteristic.
*/
@PublicEvolving
def setStreamTimeCharacteristic(characteristic: TimeCharacteristic) : Unit = {
javaEnv.setStreamTimeCharacteristic(characteristic)
}
/**
* Gets the time characteristic/
*
* @see #setStreamTimeCharacteristic
* @return The time characteristic.
*/
@PublicEvolving
def getStreamTimeCharacteristic = javaEnv.getStreamTimeCharacteristic()
/**
* Returns the custom configuration for the environment.
*/
def getCustomConfiguration: Configuration = javaEnv.getCustomConfiguration()
// --------------------------------------------------------------------------------------------
// Data stream creations
// --------------------------------------------------------------------------------------------
/**
* Creates a new DataStream that contains a sequence of numbers. This source is a parallel source.
* If you manually set the parallelism to `1` the emitted elements are in order.
*/
def generateSequence(from: Long, to: Long): DataStream[Long] = {
new DataStream[java.lang.Long](javaEnv.generateSequence(from, to))
.asInstanceOf[DataStream[Long]]
}
/**
* Creates a DataStream that contains the given elements. The elements must all be of the
* same type.
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*/
def fromElements[T: TypeInformation](data: T*): DataStream[T] = {
fromCollection(data)
}
/**
* Creates a DataStream that contains the given elements. The elements must all be of the
* same type.
*
* Note that this operation will result in a non-parallel data source v2, i.e. a data source with
* a parallelism of one.
*/
def fromElementsV2[T: TypeInformation](data: T*): DataStream[T] = {
fromCollectionV2(data)
}
/**
* Creates a DataStream from the given non-empty [[Seq]]. The elements need to be serializable
* because the framework may move the elements into the cluster if needed.
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*/
def fromCollection[T: TypeInformation](data: Seq[T]): DataStream[T] = {
require(data != null, "Data must not be null.")
val typeInfo = implicitly[TypeInformation[T]]
val collection = scala.collection.JavaConversions.asJavaCollection(data)
asScalaStream(javaEnv.fromCollection(collection, typeInfo))
}
/**
* Creates a DataStream from the given non-empty [[Seq]]. The elements need to be serializable
* because the framework may move the elements into the cluster if needed.
*
* Note that this operation will result in a non-parallel data source v2, i.e. a data source with
* a parallelism of one.
*/
def fromCollectionV2[T: TypeInformation](data: Seq[T]): DataStream[T] = {
require(data != null, "Data must not be null.")
val typeInfo = implicitly[TypeInformation[T]]
val collection = scala.collection.JavaConversions.asJavaCollection(data)
asScalaStream(javaEnv.fromCollectionV2(collection, typeInfo))
}
/**
* Creates a DataStream from the given [[Iterator]].
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*/
def fromCollection[T: TypeInformation] (data: Iterator[T]): DataStream[T] = {
val typeInfo = implicitly[TypeInformation[T]]
asScalaStream(javaEnv.fromCollection(data.asJava, typeInfo))
}
/**
* Creates a DataStream from the given [[SplittableIterator]].
*/
def fromParallelCollection[T: TypeInformation] (data: SplittableIterator[T]):
DataStream[T] = {
val typeInfo = implicitly[TypeInformation[T]]
asScalaStream(javaEnv.fromParallelCollection(data, typeInfo))
}
/**
* Creates a DataStream that represents the Strings produced by reading the
* given file line wise. The file will be read with the system's default
* character set.
*/
def readTextFile(filePath: String): DataStream[String] =
asScalaStream(javaEnv.readTextFile(filePath))
/**
* Creates a data stream that represents the Strings produced by reading the given file
* line wise. The character set with the given name will be used to read the files.
*/
def readTextFile(filePath: String, charsetName: String): DataStream[String] =
asScalaStream(javaEnv.readTextFile(filePath, charsetName))
/**
* Reads the given file with the given input format. The file path should be passed
* as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
*/
def readFile[T: TypeInformation](inputFormat: FileInputFormat[T], filePath: String):
DataStream[T] =
asScalaStream(javaEnv.readFile(inputFormat, filePath))
/**
* Creates a DataStream that contains the contents of file created while
* system watches the given path. The file will be read with the system's
* default character set. The user can check the monitoring interval in milliseconds,
* and the way file modifications are handled. By default it checks for only new files
* every 100 milliseconds.
*
*/
@Deprecated
def readFileStream(StreamPath: String, intervalMillis: Long = 100,
watchType: FileMonitoringFunction.WatchType =
FileMonitoringFunction.WatchType.ONLY_NEW_FILES): DataStream[String] =
asScalaStream(javaEnv.readFileStream(StreamPath, intervalMillis, watchType))
/**
* Reads the contents of the user-specified path based on the given [[FileInputFormat]].
* Depending on the provided [[FileProcessingMode]].
*
* @param inputFormat
* The input format used to create the data stream
* @param filePath
* The path of the file, as a URI (e.g., "file:///some/local/file" or
* "hdfs://host:port/file/path")
* @param watchType
* The mode in which the source should operate, i.e. monitor path and react
* to new data, or process once and exit
* @param interval
* In the case of periodic path monitoring, this specifies the interval (in millis)
* between consecutive path scans
* @param filter
* The files to be excluded from the processing
* @return The data stream that represents the data read from the given file
* @deprecated Use [[FileInputFormat#setFilesFilter(FilePathFilter)]] to set a filter and
* [[StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, long)]]
*/
@PublicEvolving
@Deprecated
def readFile[T: TypeInformation](
inputFormat: FileInputFormat[T],
filePath: String,
watchType: FileProcessingMode,
interval: Long,
filter: FilePathFilter): DataStream[T] = {
asScalaStream(javaEnv.readFile(inputFormat, filePath, watchType, interval, filter))
}
/**
* Reads the contents of the user-specified path based on the given [[FileInputFormat]].
* Depending on the provided [[FileProcessingMode]], the source
* may periodically monitor (every `interval` ms) the path for new data
* ([[FileProcessingMode.PROCESS_CONTINUOUSLY]]), or process
* once the data currently in the path and exit
* ([[FileProcessingMode.PROCESS_ONCE]]). In addition,
* if the path contains files not to be processed, the user can specify a custom
* [[FilePathFilter]]. As a default implementation you can use
* [[FilePathFilter.createDefaultFilter()]].
*
* ** NOTES ON CHECKPOINTING: ** If the `watchType` is set to
* [[FileProcessingMode#PROCESS_ONCE]], the source monitors the path ** once **,
* creates the [[org.apache.flink.core.fs.FileInputSplit FileInputSplits]]
* to be processed, forwards them to the downstream
* [[ContinuousFileReaderOperator readers]] to read the actual data,
* and exits, without waiting for the readers to finish reading. This
* implies that no more checkpoint barriers are going to be forwarded
* after the source exits, thus having no checkpoints after that point.
*
* @param inputFormat
* The input format used to create the data stream
* @param filePath
* The path of the file, as a URI (e.g., "file:///some/local/file" or
* "hdfs://host:port/file/path")
* @param watchType
* The mode in which the source should operate, i.e. monitor path and react
* to new data, or process once and exit
* @param interval
* In the case of periodic path monitoring, this specifies the interval (in millis)
* between consecutive path scans
* @return The data stream that represents the data read from the given file
*/
@PublicEvolving
def readFile[T: TypeInformation](
inputFormat: FileInputFormat[T],
filePath: String,
watchType: FileProcessingMode,
interval: Long): DataStream[T] = {
val typeInfo = implicitly[TypeInformation[T]]
asScalaStream(javaEnv.readFile(inputFormat, filePath, watchType, interval, typeInfo))
}
/**
* Creates a new DataStream that contains the strings received infinitely
* from socket. Received strings are decoded by the system's default
* character set. The maximum retry interval is specified in seconds, in case
* of temporary service outage reconnection is initiated every second.
*/
@PublicEvolving
def socketTextStream(hostname: String, port: Int, delimiter: Char = '\n', maxRetry: Long = 0):
DataStream[String] =
asScalaStream(javaEnv.socketTextStream(hostname, port))
/**
* Generic method to create an input data stream with a specific input format.
* Since all data streams need specific information about their types, this method needs to
* determine the type of the data produced by the input format. It will attempt to determine the
* data type by reflection, unless the input format implements the ResultTypeQueryable interface.
*/
@PublicEvolving
def createInput[T: TypeInformation](inputFormat: InputFormat[T, _]): DataStream[T] =
if (inputFormat.isInstanceOf[ResultTypeQueryable[_]]) {
asScalaStream(javaEnv.createInput(inputFormat))
} else {
asScalaStream(javaEnv.createInput(inputFormat, implicitly[TypeInformation[T]]))
}
/**
* Generic method to create an input data stream with a specific input format.
* Since all data streams need specific information about their types, this method needs to
* determine the type of the data produced by the input format. It will attempt to determine the
* data type by reflection, unless the input format implements the ResultTypeQueryable interface.
*/
@PublicEvolving
def createInputV2[T: TypeInformation](inputFormat: InputFormat[T, _]): DataStream[T] =
if (inputFormat.isInstanceOf[ResultTypeQueryable[_]]) {
asScalaStream(javaEnv.createInputV2(inputFormat))
} else {
asScalaStream(javaEnv.createInputV2(inputFormat, implicitly[TypeInformation[T]]))
}
/**
* Create a DataStream using a user defined source function for arbitrary
* source functionality. By default sources have a parallelism of 1.
* To enable parallel execution, the user defined source should implement
* ParallelSourceFunction or extend RichParallelSourceFunction.
* In these cases the resulting source will have the parallelism of the environment.
* To change this afterwards call DataStreamSource.setParallelism(int)
*
*/
def addSource[T: TypeInformation](function: SourceFunction[T]): DataStream[T] = {
require(function != null, "Function must not be null.")
val cleanFun = scalaClean(function)
val typeInfo = implicitly[TypeInformation[T]]
asScalaStream(javaEnv.addSource(cleanFun).returns(typeInfo))
}
/**
* Create a DataStream using a user defined source function v2 for arbitrary
* source functionality. By default sources have a parallelism of 1.
* To enable parallel execution, the user defined source should implement
* ParallelSourceFunction or extend RichParallelSourceFunction.
* In these cases the resulting source will have the parallelism of the environment.
* To change this afterwards call DataStreamSource.setParallelism(int)
*
*/
def addSourceV2[T: TypeInformation](function: SourceFunctionV2[T]): DataStream[T] = {
require(function != null, "Function must not be null.")
val cleanFun = scalaClean(function)
val typeInfo = implicitly[TypeInformation[T]]
asScalaStream(javaEnv.addSourceV2(cleanFun).returns(typeInfo))
}
/**
* Create a DataStream using a user defined source function for arbitrary
* source functionality.
*/
def addSource[T: TypeInformation](function: SourceContext[T] => Unit): DataStream[T] = {
require(function != null, "Function must not be null.")
val sourceFunction = new SourceFunction[T] {
val cleanFun = scalaClean(function)
override def run(ctx: SourceContext[T]) {
cleanFun(ctx)
}
override def cancel() = {}
}
addSource(sourceFunction)
}
/**
* Triggers the program execution. The environment will execute all parts of
* the program that have resulted in a "sink" operation. Sink operations are
* for example printing results or forwarding them to a message queue.
*
* The program execution will be logged and displayed with a generated
* default name.
*/
def execute() = javaEnv.execute()
/**
* Triggers the program execution. The environment will execute all parts of
* the program that have resulted in a "sink" operation. Sink operations are
* for example printing results or forwarding them to a message queue.
*
* The program execution will be logged and displayed with the provided name.
*/
def execute(jobName: String) = javaEnv.execute(jobName)
def execute(jobName: String, savePointSetting: SavepointRestoreSettings) =
javaEnv.execute(jobName, savePointSetting)
def submit(jobName: String) = javaEnv.submit(jobName)
def submit() = javaEnv.submit()
def cancel(jobId: String) = javaEnv.cancel(jobId)
def cancelWithSavepoint(jobId: String, path: String) = javaEnv.cancelWithSavepoint(jobId, path)
def triggerSavepoint(jobId: String) = javaEnv.triggerSavepoint(jobId, null)
def triggerSavepoint(jobId: String, path: String) = javaEnv.triggerSavepoint(jobId, path)
/**
* Creates the plan with which the system will execute the program, and
* returns it as a String using a JSON representation of the execution data
* flow graph. Note that this needs to be called, before the plan is
* executed.
*/
def getExecutionPlan = javaEnv.getExecutionPlan
/**
* Getter of the [[org.apache.flink.streaming.api.graph.StreamGraph]] of the streaming job.
*
* @return The StreamGraph representing the transformations
*/
@Internal
def getStreamGraph = javaEnv.getStreamGraph
/**
* Getter of the wrapped [[org.apache.flink.streaming.api.environment.StreamExecutionEnvironment]]
*
* @return The encased ExecutionEnvironment
*/
@Internal
def getWrappedStreamExecutionEnvironment = javaEnv
/**
* Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning
* is not disabled in the [[org.apache.flink.api.common.ExecutionConfig]]
*/
private[flink] def scalaClean[F <: AnyRef](f: F): F = {
if (getConfig.isClosureCleanerEnabled) {
ClosureCleaner.clean(f, true)
} else {
ClosureCleaner.ensureSerializable(f)
}
f
}
/**
* Registers a file at the distributed cache under the given name. The file will be accessible
* from any user-defined function in the (distributed) runtime under a local path. Files
* may be local files (as long as all relevant workers have access to it), or files in a
* distributed file system. The runtime will copy the files temporarily to a local cache,
* if needed.
*
* The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs
* via {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and
* provides access {@link org.apache.flink.api.common.cache.DistributedCache} via
* {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}.
*
* @param filePath The path of the file, as a URI (e.g. "file:///some/path" or
* "hdfs://host:port/and/path")
* @param name The name under which the file is registered.
*/
def registerCachedFile(filePath: String, name: String): Unit = {
javaEnv.registerCachedFile(filePath, name)
}
/**
* Registers a file at the distributed cache under the given name. The file will be accessible
* from any user-defined function in the (distributed) runtime under a local path. Files
* may be local files (as long as all relevant workers have access to it), or files in a
* distributed file system. The runtime will copy the files temporarily to a local cache,
* if needed.
*
* The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs
* via {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and
* provides access {@link org.apache.flink.api.common.cache.DistributedCache} via
* {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}.
*
* @param filePath The path of the file, as a URI (e.g. "file:///some/path" or
* "hdfs://host:port/and/path")
* @param name The name under which the file is registered.
* @param executable flag indicating whether the file should be executable
*/
def registerCachedFile(filePath: String, name: String, executable: Boolean): Unit = {
javaEnv.registerCachedFile(filePath, name, executable)
}
/**
* Stop a submitted job with JobID.
* @param jobId
*/
def stopJob(jobId: JobID): Unit = {
javaEnv.stopJob(jobId)
}
}
object StreamExecutionEnvironment {
/**
* Sets the default parallelism that will be used for the local execution
* environment created by [[createLocalEnvironment()]].
*
* @param parallelism The default parallelism to use for local execution.
*/
@PublicEvolving
def setDefaultLocalParallelism(parallelism: Int) : Unit =
JavaEnv.setDefaultLocalParallelism(parallelism)
/**
* Gets the default parallelism that will be used for the local execution environment created by
* [[createLocalEnvironment()]].
*/
@PublicEvolving
def getDefaultLocalParallelism: Int = JavaEnv.getDefaultLocalParallelism
// --------------------------------------------------------------------------
// context environment
// --------------------------------------------------------------------------
/**
* Creates an execution environment that represents the context in which the program is
* currently executed. If the program is invoked standalone, this method returns a local
* execution environment. If the program is invoked from within the command line client
* to be submitted to a cluster, this method returns the execution environment of this cluster.
*/
def getExecutionEnvironment: StreamExecutionEnvironment = {
new StreamExecutionEnvironment(JavaEnv.getExecutionEnvironment)
}
// --------------------------------------------------------------------------
// local environment
// --------------------------------------------------------------------------
/**
* Creates a local execution environment. The local execution environment will run the
* program in a multi-threaded fashion in the same JVM as the environment was created in.
*
* This method sets the environment's default parallelism to given parameter, which
* defaults to the value set via [[setDefaultLocalParallelism(Int)]].
*/
def createLocalEnvironment(parallelism: Int = JavaEnv.getDefaultLocalParallelism):
StreamExecutionEnvironment = {
new StreamExecutionEnvironment(JavaEnv.createLocalEnvironment(parallelism))
}
/**
* Creates a local execution environment. The local execution environment will run the
* program in a multi-threaded fashion in the same JVM as the environment was created in.
*
* @param parallelism The parallelism for the local environment.
* @param configuration Pass a custom configuration into the cluster.
*/
def createLocalEnvironment(parallelism: Int, configuration: Configuration):
StreamExecutionEnvironment = {
new StreamExecutionEnvironment(JavaEnv.createLocalEnvironment(parallelism, configuration))
}
/**
* Creates a [[StreamExecutionEnvironment]] for local program execution that also starts the
* web monitoring UI.
*
* The local execution environment will run the program in a multi-threaded fashion in
* the same JVM as the environment was created in. It will use the parallelism specified in the
* parameter.
*
* If the configuration key 'rest.port' was set in the configuration, that particular
* port will be used for the web UI. Otherwise, the default port (8081) will be used.
*
* @param config optional config for the local execution
* @return The created StreamExecutionEnvironment
*/
@PublicEvolving
def createLocalEnvironmentWithWebUI(config: Configuration = null): StreamExecutionEnvironment = {
val conf: Configuration = if (config == null) new Configuration() else config
new StreamExecutionEnvironment(JavaEnv.createLocalEnvironmentWithWebUI(conf))
}
// --------------------------------------------------------------------------
// remote environment
// --------------------------------------------------------------------------
/**
* Creates a remote execution environment. The remote environment sends (parts of) the program to
* a cluster for execution. Note that all file paths used in the program must be accessible from
* the cluster. The execution will use the cluster's default parallelism, unless the
* parallelism is set explicitly via [[StreamExecutionEnvironment.setParallelism()]].
*
* @param host The host name or address of the master (JobManager),
* where the program should be executed.
* @param port The port of the master (JobManager), where the program should be executed.
* @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the
* program uses
* user-defined functions, user-defined input formats, or any libraries,
* those must be
* provided in the JAR files.
*/
def createRemoteEnvironment(host: String, port: Int, jarFiles: String*):
StreamExecutionEnvironment = {
new StreamExecutionEnvironment(JavaEnv.createRemoteEnvironment(host, port, jarFiles: _*))
}
/**
* Creates a remote execution environment. The remote environment sends (parts of) the program
* to a cluster for execution. Note that all file paths used in the program must be accessible
* from the cluster. The execution will use the specified parallelism.
*
* @param host The host name or address of the master (JobManager),
* where the program should be executed.
* @param port The port of the master (JobManager), where the program should be executed.
* @param parallelism The parallelism to use during the execution.
* @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the
* program uses
* user-defined functions, user-defined input formats, or any libraries,
* those must be
* provided in the JAR files.
*/
def createRemoteEnvironment(
host: String,
port: Int,
parallelism: Int,
jarFiles: String*): StreamExecutionEnvironment = {
val javaEnv = JavaEnv.createRemoteEnvironment(host, port, jarFiles: _*)
javaEnv.setParallelism(parallelism)
new StreamExecutionEnvironment(javaEnv)
}
}