
org.apache.flink.streaming.api.datastream.DataStream Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.datastream;

import org.apache.flink.annotation.Experimental;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.eventtime.TimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkGenerator;
import org.apache.flink.api.common.eventtime.WatermarkOutput;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.io.OutputFormat;
import org.apache.flink.api.common.operators.Keys;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.common.serialization.SerializationSchema;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.connector.sink2.Sink;
import org.apache.flink.api.dag.Transformation;
import org.apache.flink.api.java.Utils;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.io.CsvOutputFormat;
import org.apache.flink.api.java.io.TextOutputFormat;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.execution.JobClient;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.sink.OutputFormatSinkFunction;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.sink.SocketClientSink;
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
import org.apache.flink.streaming.api.operators.ProcessOperator;
import org.apache.flink.streaming.api.operators.SimpleOperatorFactory;
import org.apache.flink.streaming.api.operators.StreamFilter;
import org.apache.flink.streaming.api.operators.StreamFlatMap;
import org.apache.flink.streaming.api.operators.StreamMap;
import org.apache.flink.streaming.api.operators.StreamOperatorFactory;
import org.apache.flink.streaming.api.operators.collect.ClientAndIterator;
import org.apache.flink.streaming.api.operators.collect.CollectResultIterator;
import org.apache.flink.streaming.api.operators.collect.CollectSinkOperator;
import org.apache.flink.streaming.api.operators.collect.CollectSinkOperatorFactory;
import org.apache.flink.streaming.api.operators.collect.CollectStreamSink;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.transformations.PartitionTransformation;
import org.apache.flink.streaming.api.transformations.TimestampsAndWatermarksTransformation;
import org.apache.flink.streaming.api.transformations.UnionTransformation;
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.CountEvictor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.CountTrigger;
import org.apache.flink.streaming.api.windowing.triggers.PurgingTrigger;
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.streaming.runtime.operators.util.AssignerWithPeriodicWatermarksAdapter;
import org.apache.flink.streaming.runtime.operators.util.AssignerWithPunctuatedWatermarksAdapter;
import org.apache.flink.streaming.runtime.partitioner.BroadcastPartitioner;
import org.apache.flink.streaming.runtime.partitioner.CustomPartitionerWrapper;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.GlobalPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RebalancePartitioner;
import org.apache.flink.streaming.runtime.partitioner.RescalePartitioner;
import org.apache.flink.streaming.runtime.partitioner.ShufflePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.util.keys.KeySelectorUtil;
import org.apache.flink.util.CloseableIterator;
import org.apache.flink.util.OutputTag;
import org.apache.flink.util.Preconditions;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * A DataStream represents a stream of elements of the same type. A DataStream can be transformed
 * into another DataStream by applying a transformation as for example:
 *
 * <ul>
 *   <li>{@link DataStream#map}
 *   <li>{@link DataStream#filter}
 * </ul>
 *
 * @param <T> The type of the elements in this stream.
 */
@Public
public class DataStream<T> {

    protected final StreamExecutionEnvironment environment;

    protected final Transformation<T> transformation;

    /**
     * Create a new {@link DataStream} in the given execution environment with partitioning set to
     * forward by default.
     *
     * @param environment The StreamExecutionEnvironment
     */
    public DataStream(StreamExecutionEnvironment environment, Transformation<T> transformation) {
        this.environment =
                Preconditions.checkNotNull(environment, "Execution Environment must not be null.");
        this.transformation =
                Preconditions.checkNotNull(
                        transformation, "Stream Transformation must not be null.");
    }

    /**
     * Returns the ID of the {@link DataStream} in the current {@link StreamExecutionEnvironment}.
     *
     * @return ID of the DataStream
     */
    @Internal
    public int getId() {
        return transformation.getId();
    }

    /**
     * Gets the parallelism for this operator.
     *
     * @return The parallelism set for this operator.
     */
    public int getParallelism() {
        return transformation.getParallelism();
    }

    /**
     * Gets the minimum resources for this operator.
     *
     * @return The minimum resources set for this operator.
     */
    @PublicEvolving
    public ResourceSpec getMinResources() {
        return transformation.getMinResources();
    }

    /**
     * Gets the preferred resources for this operator.
     *
     * @return The preferred resources set for this operator.
     */
    @PublicEvolving
    public ResourceSpec getPreferredResources() {
        return transformation.getPreferredResources();
    }

    /**
     * Gets the type of the stream.
     *
     * @return The type of the datastream.
     */
    public TypeInformation<T> getType() {
        return transformation.getOutputType();
    }

    /**
     * Invokes the {@link org.apache.flink.api.java.ClosureCleaner} on the given function if
     * closure cleaning is enabled in the {@link ExecutionConfig}.
     *
     * @return The cleaned Function
     */
    protected <F> F clean(F f) {
        return getExecutionEnvironment().clean(f);
    }

    /**
     * Returns the {@link StreamExecutionEnvironment} that was used to create this {@link
     * DataStream}.
     *
     * @return The Execution Environment
     */
    public StreamExecutionEnvironment getExecutionEnvironment() {
        return environment;
    }

    public ExecutionConfig getExecutionConfig() {
        return environment.getConfig();
    }

    /**
     * Creates a new {@link DataStream} by merging {@link DataStream} outputs of the same type with
     * each other. The DataStreams merged using this operator will be transformed simultaneously.
     *
     * @param streams The DataStreams to union output with.
     * @return The {@link DataStream}.
     */
    @SafeVarargs
    public final DataStream<T> union(DataStream<T>... streams) {
        List<Transformation<T>> unionedTransforms = new ArrayList<>();
        unionedTransforms.add(this.transformation);

        for (DataStream<T> newStream : streams) {
            if (!getType().equals(newStream.getType())) {
                throw new IllegalArgumentException(
                        "Cannot union streams of different types: "
                                + getType()
                                + " and "
                                + newStream.getType());
            }

            unionedTransforms.add(newStream.getTransformation());
        }
        return new DataStream<>(this.environment, new UnionTransformation<>(unionedTransforms));
    }

    /**
     * Creates a new {@link ConnectedStreams} by connecting {@link DataStream} outputs of
     * (possible) different types with each other. The DataStreams connected using this operator
     * can be used with CoFunctions to apply joint transformations.
     *
     * @param dataStream The DataStream with which this stream will be connected.
     * @return The {@link ConnectedStreams}.
     */
    public <R> ConnectedStreams<T, R> connect(DataStream<R> dataStream) {
        return new ConnectedStreams<>(environment, this, dataStream);
    }

    /**
     * Creates a new {@link BroadcastConnectedStream} by connecting the current {@link DataStream}
     * or {@link KeyedStream} with a {@link BroadcastStream}.
     *
     * <p>The latter can be created using the {@link #broadcast(MapStateDescriptor[])} method.
     *

The resulting stream can be further processed using the {@code * BroadcastConnectedStream.process(MyFunction)} method, where {@code MyFunction} can be either * a {@link org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction * KeyedBroadcastProcessFunction} or a {@link * org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction * BroadcastProcessFunction} depending on the current stream being a {@link KeyedStream} or not. * * @param broadcastStream The broadcast stream with the broadcast state to be connected with * this stream. * @return The {@link BroadcastConnectedStream}. */ @PublicEvolving public BroadcastConnectedStream connect(BroadcastStream broadcastStream) { return new BroadcastConnectedStream<>( environment, this, Preconditions.checkNotNull(broadcastStream), broadcastStream.getBroadcastStateDescriptors()); } /** * It creates a new {@link KeyedStream} that uses the provided key for partitioning its operator * states. * * @param key The KeySelector to be used for extracting the key for partitioning * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) */ public KeyedStream keyBy(KeySelector key) { Preconditions.checkNotNull(key); return new KeyedStream<>(this, clean(key)); } /** * It creates a new {@link KeyedStream} that uses the provided key with explicit type * information for partitioning its operator states. * * @param key The KeySelector to be used for extracting the key for partitioning. * @param keyType The type information describing the key type. * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) */ public KeyedStream keyBy(KeySelector key, TypeInformation keyType) { Preconditions.checkNotNull(key); Preconditions.checkNotNull(keyType); return new KeyedStream<>(this, clean(key), keyType); } /** * Partitions the operator state of a {@link DataStream} by the given key positions. * * @deprecated Use {@link DataStream#keyBy(KeySelector)}. * @param fields The position of the fields on which the {@link DataStream} will be grouped. * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) */ @Deprecated public KeyedStream keyBy(int... fields) { if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) { return keyBy(KeySelectorUtil.getSelectorForArray(fields, getType())); } else { return keyBy(new Keys.ExpressionKeys<>(fields, getType())); } } /** * Partitions the operator state of a {@link DataStream} using field expressions. A field * expression is either the name of a public field or a getter method with parentheses of the * {@link DataStream}'s underlying type. A dot can be used to drill down into objects, as in * {@code "field1.getInnerField2()" }. * * @deprecated Use {@link DataStream#keyBy(KeySelector)}. * @param fields One or more field expressions on which the state of the {@link DataStream} * operators will be partitioned. * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) */ @Deprecated public KeyedStream keyBy(String... fields) { return keyBy(new Keys.ExpressionKeys<>(fields, getType())); } private KeyedStream keyBy(Keys keys) { return new KeyedStream<>( this, clean(KeySelectorUtil.getSelectorForKeys(keys, getType(), getExecutionConfig()))); } /** * Partitions a tuple DataStream on the specified key fields using a custom partitioner. This * method takes the key position to partition on, and a partitioner that accepts the key type. * *

Note: This method works only on single field keys. * * @deprecated use {@link DataStream#partitionCustom(Partitioner, KeySelector)}. * @param partitioner The partitioner to assign partitions to keys. * @param field The field index on which the DataStream is partitioned. * @return The partitioned DataStream. */ @Deprecated public DataStream partitionCustom(Partitioner partitioner, int field) { Keys.ExpressionKeys outExpressionKeys = new Keys.ExpressionKeys<>(new int[] {field}, getType()); return partitionCustom(partitioner, outExpressionKeys); } /** * Partitions a POJO DataStream on the specified key fields using a custom partitioner. This * method takes the key expression to partition on, and a partitioner that accepts the key type. * *

Note: This method works only on single field keys. * * @deprecated use {@link DataStream#partitionCustom(Partitioner, KeySelector)}. * @param partitioner The partitioner to assign partitions to keys. * @param field The expression for the field on which the DataStream is partitioned. * @return The partitioned DataStream. */ @Deprecated public DataStream partitionCustom(Partitioner partitioner, String field) { Keys.ExpressionKeys outExpressionKeys = new Keys.ExpressionKeys<>(new String[] {field}, getType()); return partitionCustom(partitioner, outExpressionKeys); } /** * Partitions a DataStream on the key returned by the selector, using a custom partitioner. This * method takes the key selector to get the key to partition on, and a partitioner that accepts * the key type. * *

Note: This method works only on single field keys, i.e. the selector cannot return tuples * of fields. * * @param partitioner The partitioner to assign partitions to keys. * @param keySelector The KeySelector with which the DataStream is partitioned. * @return The partitioned DataStream. * @see KeySelector */ public DataStream partitionCustom( Partitioner partitioner, KeySelector keySelector) { return setConnectionType( new CustomPartitionerWrapper<>(clean(partitioner), clean(keySelector))); } // private helper method for custom partitioning private DataStream partitionCustom(Partitioner partitioner, Keys keys) { KeySelector keySelector = KeySelectorUtil.getSelectorForOneKey( keys, partitioner, getType(), getExecutionConfig()); return setConnectionType( new CustomPartitionerWrapper<>(clean(partitioner), clean(keySelector))); } /** * Sets the partitioning of the {@link DataStream} so that the output elements are broadcasted * to every parallel instance of the next operation. * * @return The DataStream with broadcast partitioning set. */ public DataStream broadcast() { return setConnectionType(new BroadcastPartitioner()); } /** * Sets the partitioning of the {@link DataStream} so that the output elements are broadcasted * to every parallel instance of the next operation. In addition, it implicitly as many {@link * org.apache.flink.api.common.state.BroadcastState broadcast states} as the specified * descriptors which can be used to store the element of the stream. * * @param broadcastStateDescriptors the descriptors of the broadcast states to create. * @return A {@link BroadcastStream} which can be used in the {@link #connect(BroadcastStream)} * to create a {@link BroadcastConnectedStream} for further processing of the elements. */ @PublicEvolving public BroadcastStream broadcast( final MapStateDescriptor... broadcastStateDescriptors) { Preconditions.checkNotNull(broadcastStateDescriptors); final DataStream broadcastStream = setConnectionType(new BroadcastPartitioner<>()); return new BroadcastStream<>(environment, broadcastStream, broadcastStateDescriptors); } /** * Sets the partitioning of the {@link DataStream} so that the output elements are shuffled * uniformly randomly to the next operation. * * @return The DataStream with shuffle partitioning set. */ @PublicEvolving public DataStream shuffle() { return setConnectionType(new ShufflePartitioner()); } /** * Sets the partitioning of the {@link DataStream} so that the output elements are forwarded to * the local subtask of the next operation. * * @return The DataStream with forward partitioning set. */ public DataStream forward() { return setConnectionType(new ForwardPartitioner()); } /** * Sets the partitioning of the {@link DataStream} so that the output elements are distributed * evenly to instances of the next operation in a round-robin fashion. * * @return The DataStream with rebalance partitioning set. */ public DataStream rebalance() { return setConnectionType(new RebalancePartitioner()); } /** * Sets the partitioning of the {@link DataStream} so that the output elements are distributed * evenly to a subset of instances of the next operation in a round-robin fashion. * *

The subset of downstream operations to which the upstream operation sends elements depends * on the degree of parallelism of both the upstream and downstream operation. For example, if * the upstream operation has parallelism 2 and the downstream operation has parallelism 4, then * one upstream operation would distribute elements to two downstream operations while the other * upstream operation would distribute to the other two downstream operations. If, on the other * hand, the downstream operation has parallelism 2 while the upstream operation has parallelism * 4 then two upstream operations will distribute to one downstream operation while the other * two upstream operations will distribute to the other downstream operations. * *

In cases where the different parallelisms are not multiples of each other one or several * downstream operations will have a differing number of inputs from upstream operations. * * @return The DataStream with rescale partitioning set. */ @PublicEvolving public DataStream rescale() { return setConnectionType(new RescalePartitioner()); } /** * Sets the partitioning of the {@link DataStream} so that the output values all go to the first * instance of the next processing operator. Use this setting with care since it might cause a * serious performance bottleneck in the application. * * @return The DataStream with shuffle partitioning set. */ @PublicEvolving public DataStream global() { return setConnectionType(new GlobalPartitioner()); } /** * Initiates an iterative part of the program that feeds back data streams. The iterative part * needs to be closed by calling {@link IterativeStream#closeWith(DataStream)}. The * transformation of this IterativeStream will be the iteration head. The data stream given to * the {@link IterativeStream#closeWith(DataStream)} method is the data stream that will be fed * back and used as the input for the iteration head. The user can also use different feedback * type than the input of the iteration and treat the input and feedback streams as a {@link * ConnectedStreams} be calling {@link IterativeStream#withFeedbackType(TypeInformation)} * *

A common usage pattern for streaming iterations is to use output splitting to send a part * of the closing data stream to the head. Refer to {@link * ProcessFunction.Context#output(OutputTag, Object)} for more information. * *

The iteration edge will be partitioned the same way as the first input of the iteration * head unless it is changed in the {@link IterativeStream#closeWith(DataStream)} call. * *

By default a DataStream with iteration will never terminate, but the user can use the * maxWaitTime parameter to set a max waiting time for the iteration head. If no data received * in the set time, the stream terminates. * * @return The iterative data stream created. */ @PublicEvolving public IterativeStream iterate() { return new IterativeStream<>(this, 0); } /** * Initiates an iterative part of the program that feeds back data streams. The iterative part * needs to be closed by calling {@link IterativeStream#closeWith(DataStream)}. The * transformation of this IterativeStream will be the iteration head. The data stream given to * the {@link IterativeStream#closeWith(DataStream)} method is the data stream that will be fed * back and used as the input for the iteration head. The user can also use different feedback * type than the input of the iteration and treat the input and feedback streams as a {@link * ConnectedStreams} be calling {@link IterativeStream#withFeedbackType(TypeInformation)} * *

A common usage pattern for streaming iterations is to use output splitting to send a part * of the closing data stream to the head. Refer to {@link * ProcessFunction.Context#output(OutputTag, Object)} for more information. * *

The iteration edge will be partitioned the same way as the first input of the iteration * head unless it is changed in the {@link IterativeStream#closeWith(DataStream)} call. * *

By default a DataStream with iteration will never terminate, but the user can use the * maxWaitTime parameter to set a max waiting time for the iteration head. If no data received * in the set time, the stream terminates. * * @param maxWaitTimeMillis Number of milliseconds to wait between inputs before shutting down * @return The iterative data stream created. */ @PublicEvolving public IterativeStream iterate(long maxWaitTimeMillis) { return new IterativeStream<>(this, maxWaitTimeMillis); } /** * Applies a Map transformation on a {@link DataStream}. The transformation calls a {@link * MapFunction} for each element of the DataStream. Each MapFunction call returns exactly one * element. The user can also extend {@link RichMapFunction} to gain access to other features * provided by the {@link org.apache.flink.api.common.functions.RichFunction} interface. * * @param mapper The MapFunction that is called for each element of the DataStream. * @param output type * @return The transformed {@link DataStream}. */ public SingleOutputStreamOperator map(MapFunction mapper) { TypeInformation outType = TypeExtractor.getMapReturnTypes( clean(mapper), getType(), Utils.getCallLocationName(), true); return map(mapper, outType); } /** * Applies a Map transformation on a {@link DataStream}. The transformation calls a {@link * MapFunction} for each element of the DataStream. Each MapFunction call returns exactly one * element. The user can also extend {@link RichMapFunction} to gain access to other features * provided by the {@link org.apache.flink.api.common.functions.RichFunction} interface. * * @param mapper The MapFunction that is called for each element of the DataStream. * @param outputType {@link TypeInformation} for the result type of the function. * @param output type * @return The transformed {@link DataStream}. */ public SingleOutputStreamOperator map( MapFunction mapper, TypeInformation outputType) { return transform("Map", outputType, new StreamMap<>(clean(mapper))); } /** * Applies a FlatMap transformation on a {@link DataStream}. The transformation calls a {@link * FlatMapFunction} for each element of the DataStream. Each FlatMapFunction call can return any * number of elements including none. The user can also extend {@link RichFlatMapFunction} to * gain access to other features provided by the {@link * org.apache.flink.api.common.functions.RichFunction} interface. * * @param flatMapper The FlatMapFunction that is called for each element of the DataStream * @param output type * @return The transformed {@link DataStream}. */ public SingleOutputStreamOperator flatMap(FlatMapFunction flatMapper) { TypeInformation outType = TypeExtractor.getFlatMapReturnTypes( clean(flatMapper), getType(), Utils.getCallLocationName(), true); return flatMap(flatMapper, outType); } /** * Applies a FlatMap transformation on a {@link DataStream}. The transformation calls a {@link * FlatMapFunction} for each element of the DataStream. Each FlatMapFunction call can return any * number of elements including none. The user can also extend {@link RichFlatMapFunction} to * gain access to other features provided by the {@link * org.apache.flink.api.common.functions.RichFunction} interface. * * @param flatMapper The FlatMapFunction that is called for each element of the DataStream * @param outputType {@link TypeInformation} for the result type of the function. * @param output type * @return The transformed {@link DataStream}. 
*/ public SingleOutputStreamOperator flatMap( FlatMapFunction flatMapper, TypeInformation outputType) { return transform("Flat Map", outputType, new StreamFlatMap<>(clean(flatMapper))); } /** * Applies the given {@link ProcessFunction} on the input stream, thereby creating a transformed * output stream. * *

The function will be called for every element in the input streams and can produce zero or * more output elements. * * @param processFunction The {@link ProcessFunction} that is called for each element in the * stream. * @param The type of elements emitted by the {@code ProcessFunction}. * @return The transformed {@link DataStream}. */ @PublicEvolving public SingleOutputStreamOperator process(ProcessFunction processFunction) { TypeInformation outType = TypeExtractor.getUnaryOperatorReturnType( processFunction, ProcessFunction.class, 0, 1, TypeExtractor.NO_INDEX, getType(), Utils.getCallLocationName(), true); return process(processFunction, outType); } /** * Applies the given {@link ProcessFunction} on the input stream, thereby creating a transformed * output stream. * *

The function will be called for every element in the input streams and can produce zero or * more output elements. * * @param processFunction The {@link ProcessFunction} that is called for each element in the * stream. * @param outputType {@link TypeInformation} for the result type of the function. * @param The type of elements emitted by the {@code ProcessFunction}. * @return The transformed {@link DataStream}. */ @Internal public SingleOutputStreamOperator process( ProcessFunction processFunction, TypeInformation outputType) { ProcessOperator operator = new ProcessOperator<>(clean(processFunction)); return transform("Process", outputType, operator); } /** * Applies a Filter transformation on a {@link DataStream}. The transformation calls a {@link * FilterFunction} for each element of the DataStream and retains only those element for which * the function returns true. Elements for which the function returns false are filtered. The * user can also extend {@link RichFilterFunction} to gain access to other features provided by * the {@link org.apache.flink.api.common.functions.RichFunction} interface. * * @param filter The FilterFunction that is called for each element of the DataStream. * @return The filtered DataStream. */ public SingleOutputStreamOperator filter(FilterFunction filter) { return transform("Filter", getType(), new StreamFilter<>(clean(filter))); } /** * Initiates a Project transformation on a {@link Tuple} {@link DataStream}.
     * Note: Only Tuple DataStreams can be projected.

The transformation projects each Tuple of the DataSet onto a (sub)set of fields. * * @param fieldIndexes The field indexes of the input tuples that are retained. The order of * fields in the output tuple corresponds to the order of field indexes. * @return The projected DataStream * @see Tuple * @see DataStream */ @PublicEvolving public SingleOutputStreamOperator project(int... fieldIndexes) { return new StreamProjection<>(this, fieldIndexes).projectTupleX(); } /** * Creates a join operation. See {@link CoGroupedStreams} for an example of how the keys and * window can be specified. */ public CoGroupedStreams coGroup(DataStream otherStream) { return new CoGroupedStreams<>(this, otherStream); } /** * Creates a join operation. See {@link JoinedStreams} for an example of how the keys and window * can be specified. */ public JoinedStreams join(DataStream otherStream) { return new JoinedStreams<>(this, otherStream); } /** * Windows this {@code DataStream} into tumbling time windows. * *

This is a shortcut for either {@code .window(TumblingEventTimeWindows.of(size))} or {@code * .window(TumblingProcessingTimeWindows.of(size))} depending on the time characteristic set * using * *

Note: This operation is inherently non-parallel since all elements have to pass through * the same operator instance. * *

{@link * org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic)} * * @param size The size of the window. * @deprecated Please use {@link #windowAll(WindowAssigner)} with either {@link * TumblingEventTimeWindows} or {@link TumblingProcessingTimeWindows}. For more information, * see the deprecation notice on {@link TimeCharacteristic} */ @Deprecated public AllWindowedStream timeWindowAll(Time size) { if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) { return windowAll(TumblingProcessingTimeWindows.of(size)); } else { return windowAll(TumblingEventTimeWindows.of(size)); } } /** * Windows this {@code DataStream} into sliding time windows. * *

This is a shortcut for either {@code .window(SlidingEventTimeWindows.of(size, slide))} or * {@code .window(SlidingProcessingTimeWindows.of(size, slide))} depending on the time * characteristic set using {@link * org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic)} * *

Note: This operation is inherently non-parallel since all elements have to pass through * the same operator instance. * * @param size The size of the window. * @deprecated Please use {@link #windowAll(WindowAssigner)} with either {@link * SlidingEventTimeWindows} or {@link SlidingProcessingTimeWindows}. For more information, * see the deprecation notice on {@link TimeCharacteristic} */ @Deprecated public AllWindowedStream timeWindowAll(Time size, Time slide) { if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) { return windowAll(SlidingProcessingTimeWindows.of(size, slide)); } else { return windowAll(SlidingEventTimeWindows.of(size, slide)); } } /** * Windows this {@code DataStream} into tumbling count windows. * *

Note: This operation is inherently non-parallel since all elements have to pass through * the same operator instance. * * @param size The size of the windows in number of elements. */ public AllWindowedStream countWindowAll(long size) { return windowAll(GlobalWindows.create()).trigger(PurgingTrigger.of(CountTrigger.of(size))); } /** * Windows this {@code DataStream} into sliding count windows. * *

Note: This operation is inherently non-parallel since all elements have to pass through * the same operator instance. * * @param size The size of the windows in number of elements. * @param slide The slide interval in number of elements. */ public AllWindowedStream countWindowAll(long size, long slide) { return windowAll(GlobalWindows.create()) .evictor(CountEvictor.of(size)) .trigger(CountTrigger.of(slide)); } /** * Windows this data stream to a {@code AllWindowedStream}, which evaluates windows over a non * key grouped stream. Elements are put into windows by a {@link * org.apache.flink.streaming.api.windowing.assigners.WindowAssigner}. The grouping of elements * is done by window. * *

A {@link org.apache.flink.streaming.api.windowing.triggers.Trigger} can be defined to * specify when windows are evaluated. However, {@code WindowAssigners} have a default {@code * Trigger} that is used if a {@code Trigger} is not specified. * *

Note: This operation is inherently non-parallel since all elements have to pass through * the same operator instance. * * @param assigner The {@code WindowAssigner} that assigns elements to windows. * @return The trigger windows data stream. */ @PublicEvolving public AllWindowedStream windowAll( WindowAssigner assigner) { return new AllWindowedStream<>(this, assigner); } // ------------------------------------------------------------------------ // Timestamps and watermarks // ------------------------------------------------------------------------ /** * Assigns timestamps to the elements in the data stream and generates watermarks to signal * event time progress. The given {@link WatermarkStrategy} is used to create a {@link * TimestampAssigner} and {@link WatermarkGenerator}. * *

For each event in the data stream, the {@link TimestampAssigner#extractTimestamp(Object, * long)} method is called to assign an event timestamp. * *

For each event in the data stream, the {@link WatermarkGenerator#onEvent(Object, long, * WatermarkOutput)} will be called. * *

Periodically (defined by the {@link ExecutionConfig#getAutoWatermarkInterval()}), the * {@link WatermarkGenerator#onPeriodicEmit(WatermarkOutput)} method will be called. * *

Common watermark generation patterns can be found as static methods in the {@link * org.apache.flink.api.common.eventtime.WatermarkStrategy} class. * * @param watermarkStrategy The strategy to generate watermarks based on event timestamps. * @return The stream after the transformation, with assigned timestamps and watermarks. */ public SingleOutputStreamOperator assignTimestampsAndWatermarks( WatermarkStrategy watermarkStrategy) { final WatermarkStrategy cleanedStrategy = clean(watermarkStrategy); // match parallelism to input, to have a 1:1 source -> timestamps/watermarks relationship // and chain final int inputParallelism = getTransformation().getParallelism(); final TimestampsAndWatermarksTransformation transformation = new TimestampsAndWatermarksTransformation<>( "Timestamps/Watermarks", inputParallelism, getTransformation(), cleanedStrategy); getExecutionEnvironment().addOperator(transformation); return new SingleOutputStreamOperator<>(getExecutionEnvironment(), transformation); } /** * Assigns timestamps to the elements in the data stream and periodically creates watermarks to * signal event time progress. * *

This method uses the deprecated watermark generator interfaces. Please switch to {@link * #assignTimestampsAndWatermarks(WatermarkStrategy)} to use the new interfaces instead. The new * interfaces support watermark idleness and no longer need to differentiate between "periodic" * and "punctuated" watermarks. * * @deprecated Please use {@link #assignTimestampsAndWatermarks(WatermarkStrategy)} instead. */ @Deprecated public SingleOutputStreamOperator assignTimestampsAndWatermarks( AssignerWithPeriodicWatermarks timestampAndWatermarkAssigner) { final AssignerWithPeriodicWatermarks cleanedAssigner = clean(timestampAndWatermarkAssigner); final WatermarkStrategy wms = new AssignerWithPeriodicWatermarksAdapter.Strategy<>(cleanedAssigner); return assignTimestampsAndWatermarks(wms); } /** * Assigns timestamps to the elements in the data stream and creates watermarks based on events, * to signal event time progress. * *

This method uses the deprecated watermark generator interfaces. Please switch to {@link * #assignTimestampsAndWatermarks(WatermarkStrategy)} to use the new interfaces instead. The new * interfaces support watermark idleness and no longer need to differentiate between "periodic" * and "punctuated" watermarks. * * @deprecated Please use {@link #assignTimestampsAndWatermarks(WatermarkStrategy)} instead. */ @Deprecated public SingleOutputStreamOperator assignTimestampsAndWatermarks( AssignerWithPunctuatedWatermarks timestampAndWatermarkAssigner) { final AssignerWithPunctuatedWatermarks cleanedAssigner = clean(timestampAndWatermarkAssigner); final WatermarkStrategy wms = new AssignerWithPunctuatedWatermarksAdapter.Strategy<>(cleanedAssigner); return assignTimestampsAndWatermarks(wms); } // ------------------------------------------------------------------------ // Data sinks // ------------------------------------------------------------------------ /** * Writes a DataStream to the standard output stream (stdout). * *

For each element of the DataStream the result of {@link Object#toString()} is written. * *

NOTE: This will print to stdout on the machine where the code is executed, i.e. the Flink * worker. * * @return The closed DataStream. */ @PublicEvolving public DataStreamSink print() { PrintSinkFunction printFunction = new PrintSinkFunction<>(); return addSink(printFunction).name("Print to Std. Out"); } /** * Writes a DataStream to the standard error stream (stderr). * *

For each element of the DataStream the result of {@link Object#toString()} is written. * *

NOTE: This will print to stderr on the machine where the code is executed, i.e. the Flink * worker. * * @return The closed DataStream. */ @PublicEvolving public DataStreamSink printToErr() { PrintSinkFunction printFunction = new PrintSinkFunction<>(true); return addSink(printFunction).name("Print to Std. Err"); } /** * Writes a DataStream to the standard output stream (stdout). * *

For each element of the DataStream the result of {@link Object#toString()} is written. * *

NOTE: This will print to stdout on the machine where the code is executed, i.e. the Flink * worker. * * @param sinkIdentifier The string to prefix the output with. * @return The closed DataStream. */ @PublicEvolving public DataStreamSink print(String sinkIdentifier) { PrintSinkFunction printFunction = new PrintSinkFunction<>(sinkIdentifier, false); return addSink(printFunction).name("Print to Std. Out"); } /** * Writes a DataStream to the standard error stream (stderr). * *

For each element of the DataStream the result of {@link Object#toString()} is written. * *

NOTE: This will print to stderr on the machine where the code is executed, i.e. the Flink * worker. * * @param sinkIdentifier The string to prefix the output with. * @return The closed DataStream. */ @PublicEvolving public DataStreamSink printToErr(String sinkIdentifier) { PrintSinkFunction printFunction = new PrintSinkFunction<>(sinkIdentifier, true); return addSink(printFunction).name("Print to Std. Err"); } /** * Writes a DataStream to the file specified by path in text format. * *

For every element of the DataStream the result of {@link Object#toString()} is written. * * @param path The path pointing to the location the text file is written to. * @return The closed DataStream. * @deprecated Please use the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} explicitly * using the {@link #addSink(SinkFunction)} method. */ @Deprecated @PublicEvolving public DataStreamSink writeAsText(String path) { return writeUsingOutputFormat(new TextOutputFormat(new Path(path))); } /** * Writes a DataStream to the file specified by path in text format. * *

For every element of the DataStream the result of {@link Object#toString()} is written. * * @param path The path pointing to the location the text file is written to * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and * OVERWRITE. * @return The closed DataStream. * @deprecated Please use the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} explicitly * using the {@link #addSink(SinkFunction)} method. */ @Deprecated @PublicEvolving public DataStreamSink writeAsText(String path, WriteMode writeMode) { TextOutputFormat tof = new TextOutputFormat<>(new Path(path)); tof.setWriteMode(writeMode); return writeUsingOutputFormat(tof); } /** * Writes a DataStream to the file specified by the path parameter. * *

For every field of an element of the DataStream the result of {@link Object#toString()} is * written. This method can only be used on data streams of tuples. * * @param path the path pointing to the location the text file is written to * @return the closed DataStream * @deprecated Please use the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} explicitly * using the {@link #addSink(SinkFunction)} method. */ @Deprecated @PublicEvolving public DataStreamSink writeAsCsv(String path) { return writeAsCsv( path, null, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER); } /** * Writes a DataStream to the file specified by the path parameter. * *

For every field of an element of the DataStream the result of {@link Object#toString()} is * written. This method can only be used on data streams of tuples. * * @param path the path pointing to the location the text file is written to * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and * OVERWRITE. * @return the closed DataStream * @deprecated Please use the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} explicitly * using the {@link #addSink(SinkFunction)} method. */ @Deprecated @PublicEvolving public DataStreamSink writeAsCsv(String path, WriteMode writeMode) { return writeAsCsv( path, writeMode, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER); } /** * Writes a DataStream to the file specified by the path parameter. The writing is performed * periodically every millis milliseconds. * *

For every field of an element of the DataStream the result of {@link Object#toString()} is * written. This method can only be used on data streams of tuples. * * @param path the path pointing to the location the text file is written to * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and * OVERWRITE. * @param rowDelimiter the delimiter for two rows * @param fieldDelimiter the delimiter for two fields * @return the closed DataStream * @deprecated Please use the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} explicitly * using the {@link #addSink(SinkFunction)} method. */ @SuppressWarnings("unchecked") @Deprecated @PublicEvolving public DataStreamSink writeAsCsv( String path, WriteMode writeMode, String rowDelimiter, String fieldDelimiter) { Preconditions.checkArgument( getType().isTupleType(), "The writeAsCsv() method can only be used on data streams of tuples."); CsvOutputFormat of = new CsvOutputFormat<>(new Path(path), rowDelimiter, fieldDelimiter); if (writeMode != null) { of.setWriteMode(writeMode); } return writeUsingOutputFormat((OutputFormat) of); } /** * Writes the DataStream to a socket as a byte array. The format of the output is specified by a * {@link SerializationSchema}. * * @param hostName host of the socket * @param port port of the socket * @param schema schema for serialization * @return the closed DataStream */ @PublicEvolving public DataStreamSink writeToSocket( String hostName, int port, SerializationSchema schema) { DataStreamSink returnStream = addSink(new SocketClientSink<>(hostName, port, schema, 0)); returnStream.setParallelism( 1); // It would not work if multiple instances would connect to the same port return returnStream; } /** * Writes the dataStream into an output, described by an OutputFormat. * *

The output is not participating in Flink's checkpointing! * *

For writing to a file system periodically, the use of the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} is recommended. * * @param format The output format * @return The closed DataStream * @deprecated Please use the {@link * org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink} explicitly * using the {@link #addSink(SinkFunction)} method. */ @Deprecated @PublicEvolving public DataStreamSink writeUsingOutputFormat(OutputFormat format) { return addSink(new OutputFormatSinkFunction<>(format)); } /** * Method for passing user defined operators along with the type information that will transform * the DataStream. * * @param operatorName name of the operator, for logging purposes * @param outTypeInfo the output type of the operator * @param operator the object containing the transformation logic * @param type of the return stream * @return the data stream constructed * @see #transform(String, TypeInformation, OneInputStreamOperatorFactory) */ @PublicEvolving public SingleOutputStreamOperator transform( String operatorName, TypeInformation outTypeInfo, OneInputStreamOperator operator) { return doTransform(operatorName, outTypeInfo, SimpleOperatorFactory.of(operator)); } /** * Method for passing user defined operators created by the given factory along with the type * information that will transform the DataStream. * *

This method uses the rather new operator factories and should only be used when custom * factories are needed. * * @param operatorName name of the operator, for logging purposes * @param outTypeInfo the output type of the operator * @param operatorFactory the factory for the operator. * @param type of the return stream * @return the data stream constructed. */ @PublicEvolving public SingleOutputStreamOperator transform( String operatorName, TypeInformation outTypeInfo, OneInputStreamOperatorFactory operatorFactory) { return doTransform(operatorName, outTypeInfo, operatorFactory); } protected SingleOutputStreamOperator doTransform( String operatorName, TypeInformation outTypeInfo, StreamOperatorFactory operatorFactory) { // read the output type of the input Transform to coax out errors about MissingTypeInfo transformation.getOutputType(); OneInputTransformation resultTransform = new OneInputTransformation<>( this.transformation, operatorName, operatorFactory, outTypeInfo, environment.getParallelism()); @SuppressWarnings({"unchecked", "rawtypes"}) SingleOutputStreamOperator returnStream = new SingleOutputStreamOperator(environment, resultTransform); getExecutionEnvironment().addOperator(resultTransform); return returnStream; } /** * Internal function for setting the partitioner for the DataStream. * * @param partitioner Partitioner to set. * @return The modified DataStream. */ protected DataStream setConnectionType(StreamPartitioner partitioner) { return new DataStream<>( this.getExecutionEnvironment(), new PartitionTransformation<>(this.getTransformation(), partitioner)); } /** * Adds the given sink to this DataStream. Only streams with sinks added will be executed once * the {@link StreamExecutionEnvironment#execute()} method is called. * * @param sinkFunction The object containing the sink's invoke function. * @return The closed DataStream. */ public DataStreamSink addSink(SinkFunction sinkFunction) { // read the output type of the input Transform to coax out errors about MissingTypeInfo transformation.getOutputType(); // configure the type if needed if (sinkFunction instanceof InputTypeConfigurable) { ((InputTypeConfigurable) sinkFunction).setInputType(getType(), getExecutionConfig()); } return DataStreamSink.forSinkFunction(this, clean(sinkFunction)); } /** * Adds the given {@link Sink} to this DataStream. Only streams with sinks added will be * executed once the {@link StreamExecutionEnvironment#execute()} method is called. * * @param sink The user defined sink. * @return The closed DataStream. */ @PublicEvolving public DataStreamSink sinkTo(org.apache.flink.api.connector.sink.Sink sink) { return this.sinkTo(sink, CustomSinkOperatorUidHashes.DEFAULT); } /** * Adds the given {@link Sink} to this DataStream. Only streams with sinks added will be * executed once the {@link StreamExecutionEnvironment#execute()} method is called. * *

This method is intended to be used only to recover a snapshot where no uids have been set * before taking the snapshot. * * @param sink The user defined sink. * @return The closed DataStream. */ @PublicEvolving public DataStreamSink sinkTo( org.apache.flink.api.connector.sink.Sink sink, CustomSinkOperatorUidHashes customSinkOperatorUidHashes) { // read the output type of the input Transform to coax out errors about MissingTypeInfo transformation.getOutputType(); return DataStreamSink.forSinkV1(this, sink, customSinkOperatorUidHashes); } /** * Adds the given {@link Sink} to this DataStream. Only streams with sinks added will be * executed once the {@link StreamExecutionEnvironment#execute()} method is called. * * @param sink The user defined sink. * @return The closed DataStream. */ @PublicEvolving public DataStreamSink sinkTo(Sink sink) { return this.sinkTo(sink, CustomSinkOperatorUidHashes.DEFAULT); } /** * Adds the given {@link Sink} to this DataStream. Only streams with sinks added will be * executed once the {@link StreamExecutionEnvironment#execute()} method is called. * *

This method is intended to be used only to recover a snapshot where no uids have been set * before taking the snapshot. * * @param customSinkOperatorUidHashes operator hashes to support state binding * @param sink The user defined sink. * @return The closed DataStream. */ @PublicEvolving public DataStreamSink sinkTo( Sink sink, CustomSinkOperatorUidHashes customSinkOperatorUidHashes) { // read the output type of the input Transform to coax out errors about MissingTypeInfo transformation.getOutputType(); return DataStreamSink.forSink(this, sink, customSinkOperatorUidHashes); } /** * Triggers the distributed execution of the streaming dataflow and returns an iterator over the * elements of the given DataStream. * *

The DataStream application is executed in the regular distributed manner on the target * environment, and the events from the stream are polled back to this application process and * thread through Flink's REST API. * *

IMPORTANT The returned iterator must be closed to free all cluster resources. */ public CloseableIterator executeAndCollect() throws Exception { return executeAndCollect("DataStream Collect"); } /** * Triggers the distributed execution of the streaming dataflow and returns an iterator over the * elements of the given DataStream. * *

The DataStream application is executed in the regular distributed manner on the target * environment, and the events from the stream are polled back to this application process and * thread through Flink's REST API. * *

IMPORTANT The returned iterator must be closed to free all cluster resources. */ public CloseableIterator executeAndCollect(String jobExecutionName) throws Exception { return executeAndCollectWithClient(jobExecutionName).iterator; } /** * Triggers the distributed execution of the streaming dataflow and returns an iterator over the * elements of the given DataStream. * *

The DataStream application is executed in the regular distributed manner on the target * environment, and the events from the stream are polled back to this application process and * thread through Flink's REST API. */ public List executeAndCollect(int limit) throws Exception { return executeAndCollect("DataStream Collect", limit); } /** * Triggers the distributed execution of the streaming dataflow and returns an iterator over the * elements of the given DataStream. * *

The DataStream application is executed in the regular distributed manner on the target * environment, and the events from the stream are polled back to this application process and * thread through Flink's REST API. */ public List executeAndCollect(String jobExecutionName, int limit) throws Exception { Preconditions.checkState(limit > 0, "Limit must be greater than 0"); try (ClientAndIterator clientAndIterator = executeAndCollectWithClient(jobExecutionName)) { List results = new ArrayList<>(limit); while (limit > 0 && clientAndIterator.iterator.hasNext()) { results.add(clientAndIterator.iterator.next()); limit--; } return results; } } /** * Sets up the collection of the elements in this {@link DataStream}, and returns an iterator * over the collected elements that can be used to retrieve elements once the job execution has * started. * *

Caution: When multiple streams are being collected it is recommended to consume all * streams in parallel to not back-pressure the job. * *

Caution: Closing the returned iterator cancels the job! It is recommended to close all * iterators once you are no longer interested in any of the collected streams. * *

This method is functionally equivalent to {@link #collectAsync(Collector)}. * * @return iterator over the contained elements */ @Experimental public CloseableIterator collectAsync() { final Collector collector = new Collector<>(); collectAsync(collector); return collector.getOutput(); } /** * Sets up the collection of the elements in this {@link DataStream}, which can be retrieved * later via the given {@link Collector}. * *

Caution: When multiple streams are being collected it is recommended to consume all * streams in parallel to not back-pressure the job. * *

Caution: Closing the iterator from the collector cancels the job! It is recommended to * close all iterators once you are no longer interested in any of the collected streams. * *

This method is functionally equivalent to {@link #collectAsync()}. * *

This method is meant to support use-cases where the application of a sink is done via a * {@code Consumer>}, where it wouldn't be possible (or inconvenient) to return an * iterator. * * @param collector a collector that can be used to retrieve the elements */ @Experimental public void collectAsync(Collector collector) { TypeSerializer serializer = getType().createSerializer(getExecutionEnvironment().getConfig()); String accumulatorName = "dataStreamCollect_" + UUID.randomUUID().toString(); StreamExecutionEnvironment env = getExecutionEnvironment(); CollectSinkOperatorFactory factory = new CollectSinkOperatorFactory<>(serializer, accumulatorName); CollectSinkOperator operator = (CollectSinkOperator) factory.getOperator(); CollectResultIterator iterator = new CollectResultIterator<>( operator.getOperatorIdFuture(), serializer, accumulatorName, env.getCheckpointConfig()); CollectStreamSink sink = new CollectStreamSink<>(this, factory); sink.name("Data stream collect sink"); env.addOperator(sink.getTransformation()); env.registerCollectIterator(iterator); collector.setIterator(iterator); } /** * This class acts as an accessor to elements collected via {@link #collectAsync(Collector)}. * * @param the element type */ @Experimental public static class Collector { private CloseableIterator iterator; @Internal void setIterator(CloseableIterator iterator) { this.iterator = iterator; } /** * Returns an iterator over the collected elements. The returned iterator must only be used * once the job execution was triggered. * *

         * <p>This method will always return the same iterator instance.
         *
         * @return iterator over collected elements
         */
        public CloseableIterator<T> getOutput() {
            // we intentionally fail here instead of waiting, because it indicates a
            // misunderstanding on the user and would usually just block the application
            Preconditions.checkNotNull(iterator, "The job execution was not yet started.");
            return iterator;
        }
    }

    ClientAndIterator<T> executeAndCollectWithClient(String jobExecutionName) throws Exception {
        final CloseableIterator<T> iterator = collectAsync();

        final JobClient jobClient = getExecutionEnvironment().executeAsync(jobExecutionName);
        return new ClientAndIterator<>(jobClient, iterator);
    }

    /**
     * Returns the {@link Transformation} that represents the operation that logically creates
     * this {@link DataStream}.
     *
     * @return The Transformation
     */
    @Internal
    public Transformation<T> getTransformation() {
        return transformation;
    }
}
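For orientation, the following is a minimal usage sketch of the API declared above. It is illustrative only: the class name DataStreamUsageSketch, the element values, and the job name are invented for this example, and it assumes a Flink 1.15+ runtime where StreamExecutionEnvironment.getExecutionEnvironment(), fromElements(), and the transformation and sink methods documented in the Javadoc above are available on the classpath.

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class DataStreamUsageSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // A small in-memory source; any connector source could be used instead.
        DataStream<String> words = env.fromElements("flink", "stream", "datastream", "flink");

        words
                // map: exactly one output element per input element
                .map(String::toUpperCase)
                // filter: keep only elements for which the predicate returns true
                .filter(w -> w.length() > 5)
                // keyBy: partition the stream by a key extracted from each element
                .keyBy(w -> w)
                // window: group the keyed elements into 5-second processing-time windows
                // (window(...) is declared on the KeyedStream returned by keyBy)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
                // reduce: combine the elements of each key/window pair
                .reduce((a, b) -> a + "," + b)
                // print: adds a sink that writes each element's toString() to stdout
                .print();

        // For tests, elements can also be pulled back to the client instead of adding a sink,
        // e.g. words.executeAndCollect(10), which triggers its own job execution.

        env.execute("DataStream usage sketch");
    }
}

The print() call corresponds to the print() sink declared in this file; swapping it for addSink(...) or sinkTo(...) would exercise the other sink entry points, and executeAndCollect()/collectAsync() cover the client-side collection paths described at the end of the class.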




