
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.datastream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.io.OutputFormat;
import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.Utils;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.io.CsvOutputFormat;
import org.apache.flink.api.java.io.TextOutputFormat;
import org.apache.flink.api.java.operators.Keys;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.TimestampExtractor;
import org.apache.flink.streaming.api.functions.sink.FileSinkFunctionByMillis;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.sink.SocketClientSink;
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
import org.apache.flink.streaming.api.operators.StreamFilter;
import org.apache.flink.streaming.api.operators.StreamFlatMap;
import org.apache.flink.streaming.api.operators.StreamMap;
import org.apache.flink.streaming.api.operators.StreamSink;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.transformations.PartitionTransformation;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.api.transformations.UnionTransformation;
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.CountEvictor;
import org.apache.flink.streaming.api.windowing.time.AbstractTime;
import org.apache.flink.streaming.api.windowing.triggers.CountTrigger;
import org.apache.flink.streaming.api.windowing.triggers.PurgingTrigger;
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.streaming.runtime.operators.ExtractTimestampsOperator;
import org.apache.flink.streaming.runtime.partitioner.BroadcastPartitioner;
import org.apache.flink.streaming.runtime.partitioner.CustomPartitionerWrapper;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RebalancePartitioner;
import org.apache.flink.streaming.runtime.partitioner.HashPartitioner;
import org.apache.flink.streaming.runtime.partitioner.GlobalPartitioner;
import org.apache.flink.streaming.runtime.partitioner.ShufflePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.util.keys.KeySelectorUtil;
import org.apache.flink.streaming.util.serialization.SerializationSchema;
import com.google.common.base.Preconditions;
/**
* A DataStream represents a stream of elements of the same type. A DataStream
* can be transformed into another DataStream by applying a transformation, for
* example:
* <ul>
* <li>{@link DataStream#map},</li>
* <li>{@link DataStream#filter}, or</li>
* </ul>
*
* @param <T> The type of the elements in this Stream
*/
public class DataStream<T> {
protected final StreamExecutionEnvironment environment;
protected final StreamTransformation<T> transformation;
/**
* Create a new {@link DataStream} in the given execution environment with
* partitioning set to forward by default.
*
* @param environment The StreamExecutionEnvironment
*/
public DataStream(StreamExecutionEnvironment environment, StreamTransformation<T> transformation) {
this.environment = Preconditions.checkNotNull(environment, "Execution Environment must not be null.");
this.transformation = Preconditions.checkNotNull(transformation, "Stream Transformation must not be null.");
}
/**
* Returns the ID of the {@link DataStream} in the current {@link StreamExecutionEnvironment}.
*
* @return ID of the DataStream
*/
public Integer getId() {
return transformation.getId();
}
/**
* Gets the parallelism for this operator.
*
* @return The parallelism set for this operator.
*/
public int getParallelism() {
return transformation.getParallelism();
}
/**
* Gets the type of the stream.
*
* @return The type of the datastream.
*/
public TypeInformation<T> getType() {
return transformation.getOutputType();
}
/**
* Invokes the {@link org.apache.flink.api.java.ClosureCleaner}
* on the given function if closure cleaning is enabled in the {@link ExecutionConfig}.
*
* @return The cleaned Function
*/
protected <F> F clean(F f) {
return getExecutionEnvironment().clean(f);
}
/**
* Returns the {@link StreamExecutionEnvironment} that was used to create this
* {@link DataStream}
*
* @return The Execution Environment
*/
public StreamExecutionEnvironment getExecutionEnvironment() {
return environment;
}
public ExecutionConfig getExecutionConfig() {
return environment.getConfig();
}
/**
* Creates a new {@link DataStream} by merging {@link DataStream} outputs of
* the same type with each other. The DataStreams merged using this operator
* will be transformed simultaneously.
*
* @param streams
* The DataStreams to union output with.
* @return The {@link DataStream}.
*/
@SafeVarargs
public final DataStream<T> union(DataStream<T>... streams) {
List<StreamTransformation<T>> unionedTransforms = new ArrayList<>();
unionedTransforms.add(this.transformation);
Collection<StreamTransformation<?>> thisPredecessors = this.getTransformation().getTransitivePredecessors();
for (DataStream<T> newStream : streams) {
if (newStream.getParallelism() != this.getParallelism()) {
throw new IllegalArgumentException(
"DataStream can only be unioned with DataStreams of the same parallelism. " +
"This Stream: " + this.getTransformation() +
", other stream: " + newStream.getTransformation());
}
if (!getType().equals(newStream.getType())) {
throw new IllegalArgumentException("Cannot union streams of different types: "
+ getType() + " and " + newStream.getType());
}
Collection<StreamTransformation<?>> predecessors = newStream.getTransformation().getTransitivePredecessors();
if (predecessors.contains(this.transformation) || thisPredecessors.contains(newStream.getTransformation())) {
throw new UnsupportedOperationException("A DataStream cannot be unioned with itself");
}
unionedTransforms.add(newStream.getTransformation());
}
return new DataStream<>(this.environment, new UnionTransformation<>(unionedTransforms));
}
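/*
* Usage sketch (illustrative, not part of the original source). Assumes two
* DataStream<String> instances named "first" and "second" that were created
* elsewhere with the same parallelism:
*
*   DataStream<String> both = first.union(second);
*/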
/**
* Operator used for directing tuples to specific named outputs using an
* {@link org.apache.flink.streaming.api.collector.selector.OutputSelector}.
* Calling this method on an operator creates a new {@link SplitStream}.
*
* @param outputSelector
* The user defined
* {@link org.apache.flink.streaming.api.collector.selector.OutputSelector}
* for directing the tuples.
* @return The {@link SplitStream}
*/
public SplitStream<T> split(OutputSelector<T> outputSelector) {
return new SplitStream<>(this, clean(outputSelector));
}
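/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<Integer> named "numbers"; java.util.Collections is used for the
* single-element output name list:
*
*   SplitStream<Integer> split = numbers.split(new OutputSelector<Integer>() {
*       @Override
*       public Iterable<String> select(Integer value) {
*           return Collections.singletonList(value % 2 == 0 ? "even" : "odd");
*       }
*   });
*   DataStream<Integer> evens = split.select("even");
*/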
/**
* Creates a new {@link ConnectedStreams} by connecting
* {@link DataStream} outputs of (possibly) different types with each other.
* The DataStreams connected using this operator can be used with
* CoFunctions to apply joint transformations.
*
* @param dataStream
* The DataStream with which this stream will be connected.
* @return The {@link ConnectedStreams}.
*/
public <R> ConnectedStreams<T, R> connect(DataStream<R> dataStream) {
return new ConnectedStreams<>(environment, this, dataStream);
}
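/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<Integer> "data" and a DataStream<String> "control", and a
* CoMapFunction from org.apache.flink.streaming.api.functions.co:
*
*   DataStream<String> merged = data.connect(control)
*       .map(new CoMapFunction<Integer, String, String>() {
*           @Override
*           public String map1(Integer value) { return value.toString(); }
*           @Override
*           public String map2(String value) { return value; }
*       });
*/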
/**
* Creates a new {@link KeyedStream} that uses the provided key for partitioning
* its operator states.
*
* @param key
* The KeySelector to be used for extracting the key for partitioning
* @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
*/
public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) {
return new KeyedStream<>(this, clean(key));
}
/**
* Partitions the operator state of a {@link DataStream} by the given key positions.
*
* @param fields
* The position of the fields on which the {@link DataStream}
* will be grouped.
* @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
*/
public KeyedStream<T, Tuple> keyBy(int... fields) {
if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) {
return keyBy(KeySelectorUtil.getSelectorForArray(fields, getType()));
} else {
return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
}
}
/**
* Partitions the operator state of a {@link DataStream} using field expressions.
* A field expression is either the name of a public field or a getter method with parentheses
* of the {@link DataStream}'s underlying type. A dot can be used to drill
* down into objects, as in {@code "field1.getInnerField2()" }.
*
* @param fields
* One or more field expressions on which the state of the {@link DataStream} operators will be
* partitioned.
* @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
**/
public KeyedStream<T, Tuple> keyBy(String... fields) {
return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
}
private KeyedStream<T, Tuple> keyBy(Keys<T> keys) {
return new KeyedStream<>(this, clean(KeySelectorUtil.getSelectorForKeys(keys,
getType(), getExecutionConfig())));
}
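/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<Tuple2<String, Integer>> named "pairs":
*
*   KeyedStream<Tuple2<String, Integer>, Tuple> byPosition  = pairs.keyBy(0);
*   KeyedStream<Tuple2<String, Integer>, Tuple> byFieldName = pairs.keyBy("f0");
*/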
/**
* Sets the partitioning of the {@link DataStream} so that the output is
* partitioned by hashing on the given fields. This setting only
* affects how the outputs will be distributed between the parallel
* instances of the next processing operator.
*
* @param fields The tuple fields that should be used for partitioning
* @return The partitioned DataStream
*
*/
public DataStream<T> partitionByHash(int... fields) {
if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) {
return partitionByHash(KeySelectorUtil.getSelectorForArray(fields, getType()));
} else {
return partitionByHash(new Keys.ExpressionKeys<>(fields, getType()));
}
}
/**
* Sets the partitioning of the {@link DataStream} so that the output is
* partitioned by hashing on the given fields. This setting only
* affects how the outputs will be distributed between the parallel
* instances of the next processing operator.
*
* @param fields The tuple fields that should be used for partitioning
* @return The partitioned DataStream
*
*/
public DataStream<T> partitionByHash(String... fields) {
return partitionByHash(new Keys.ExpressionKeys<>(fields, getType()));
}
/**
* Sets the partitioning of the {@link DataStream} so that the output is
* partitioned using the given {@link KeySelector}. This setting only
* affects how the outputs will be distributed between the parallel
* instances of the next processing operator.
*
* @param keySelector The function that extracts the key from an element in the Stream
* @return The partitioned DataStream
*/
public DataStream<T> partitionByHash(KeySelector<T, ?> keySelector) {
return setConnectionType(new HashPartitioner<>(clean(keySelector)));
}
//private helper method for partitioning
private DataStream<T> partitionByHash(Keys<T> keys) {
KeySelector<T, ?> keySelector = clean(KeySelectorUtil.getSelectorForKeys(
keys,
getType(),
getExecutionConfig()));
return setConnectionType(new HashPartitioner<>(keySelector));
}
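/*
* Usage sketch (illustrative, not part of the original source). Unlike keyBy(),
* this only changes how elements are routed to the next operator; it does not
* create a KeyedStream. Assumes a DataStream<Tuple2<String, Integer>> "pairs":
*
*   DataStream<Tuple2<String, Integer>> hashed = pairs.partitionByHash(0);
*/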
/**
* Partitions a tuple DataStream on the specified key fields using a custom partitioner.
* This method takes the key position to partition on, and a partitioner that accepts the key type.
*
* Note: This method works only on single field keys.
*
* @param partitioner The partitioner to assign partitions to keys.
* @param field The field index on which the DataStream is to be partitioned.
* @return The partitioned DataStream.
*/
public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, int field) {
Keys.ExpressionKeys<T> outExpressionKeys = new Keys.ExpressionKeys<>(new int[]{field}, getType());
return partitionCustom(partitioner, outExpressionKeys);
}
/**
* Partitions a POJO DataStream on the specified key fields using a custom partitioner.
* This method takes the key expression to partition on, and a partitioner that accepts the key type.
*
* Note: This method works only on single field keys.
*
* @param partitioner The partitioner to assign partitions to keys.
* @param field The field expression on which the DataStream is to be partitioned.
* @return The partitioned DataStream.
*/
public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, String field) {
Keys.ExpressionKeys<T> outExpressionKeys = new Keys.ExpressionKeys<>(new String[]{field}, getType());
return partitionCustom(partitioner, outExpressionKeys);
}
/**
* Partitions a DataStream on the key returned by the selector, using a custom partitioner.
* This method takes the key selector to get the key to partition on, and a partitioner that
* accepts the key type.
*
* Note: This method works only on single field keys, i.e. the selector cannot return tuples
* of fields.
*
* @param partitioner
* The partitioner to assign partitions to keys.
* @param keySelector
* The KeySelector with which the DataStream is partitioned.
* @return The partitioned DataStream.
* @see KeySelector
*/
public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, KeySelector<T, K> keySelector) {
return setConnectionType(new CustomPartitionerWrapper<>(clean(partitioner),
clean(keySelector)));
}
// private helper method for custom partitioning
private <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, Keys<T> keys) {
KeySelector<T, K> keySelector = KeySelectorUtil.getSelectorForOneKey(keys, partitioner, getType(), getExecutionConfig());
return setConnectionType(
new CustomPartitionerWrapper<>(
clean(partitioner),
clean(keySelector)));
}
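/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<Tuple2<String, Integer>> "pairs" partitioned on its String field:
*
*   DataStream<Tuple2<String, Integer>> custom = pairs.partitionCustom(
*       new Partitioner<String>() {
*           @Override
*           public int partition(String key, int numPartitions) {
*               return Math.abs(key.hashCode()) % numPartitions;
*           }
*       }, 0);
*/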
/**
* Sets the partitioning of the {@link DataStream} so that the output tuples
* are broadcast to every parallel instance of the next component.
*
*
* This setting only affects how the outputs will be distributed between
* the parallel instances of the next processing operator.
*
* @return The DataStream with broadcast partitioning set.
*/
public DataStream<T> broadcast() {
return setConnectionType(new BroadcastPartitioner<T>());
}
/**
* Sets the partitioning of the {@link DataStream} so that the output tuples
* are shuffled uniformly at random to the next component.
*
*
* This setting only affects how the outputs will be distributed between
* the parallel instances of the next processing operator.
*
* @return The DataStream with shuffle partitioning set.
*/
public DataStream<T> shuffle() {
return setConnectionType(new ShufflePartitioner<T>());
}
/**
* Sets the partitioning of the {@link DataStream} so that the output tuples
* are forwarded to the local subtask of the next component (whenever
* possible).
*
*
* This setting only affects how the outputs will be distributed between
* the parallel instances of the next processing operator.
*
* @return The DataStream with forward partitioning set.
*/
public DataStream<T> forward() {
return setConnectionType(new ForwardPartitioner<T>());
}
/**
* Sets the partitioning of the {@link DataStream} so that the output tuples
* are distributed evenly to instances of the next component in a round-robin
* fashion.
*
*
* This setting only affects how the outputs will be distributed between
* the parallel instances of the next processing operator.
*
* @return The DataStream with rebalance partitioning set.
*/
public DataStream<T> rebalance() {
return setConnectionType(new RebalancePartitioner<T>());
}
/**
* Sets the partitioning of the {@link DataStream} so that the output values
* all go to the first instance of the next processing operator. Use this
* setting with care since it might cause a serious performance bottleneck
* in the application.
*
* @return The DataStream with global partitioning set.
*/
public DataStream<T> global() {
return setConnectionType(new GlobalPartitioner<T>());
}
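/*
* Quick comparison of the partitioning setters above (illustrative, not part of
* the original source). Assumes a DataStream<String> named "stream":
*
*   stream.broadcast();   // every element goes to every parallel downstream instance
*   stream.shuffle();     // elements are distributed uniformly at random
*   stream.forward();     // elements stay in the local subtask where possible
*   stream.rebalance();   // elements are distributed round-robin
*   stream.global();      // all elements go to the first downstream instance
*/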
/**
* Initiates an iterative part of the program that feeds back data streams.
* The iterative part needs to be closed by calling
* {@link IterativeStream#closeWith(DataStream)}. The transformation of
* this IterativeStream will be the iteration head. The data stream
* given to the {@link IterativeStream#closeWith(DataStream)} method is
* the data stream that will be fed back and used as the input for the
* iteration head. The user can also use a different feedback type than the
* input of the iteration and treat the input and feedback streams as a
* {@link ConnectedStreams} by calling
* {@link IterativeStream#withFeedbackType(TypeInformation)}.
*
* A common usage pattern for streaming iterations is to use output
* splitting to send a part of the closing data stream to the head. Refer to
* {@link #split(OutputSelector)} for more information.
*
* The iteration edge will be partitioned the same way as the first input of
* the iteration head unless it is changed in the
* {@link IterativeStream#closeWith(DataStream)} call.
*
* By default a DataStream with iteration will never terminate, but the user
* can use the maxWaitTime parameter to set a max waiting time for the
* iteration head. If no data is received within the set time, the stream
* terminates.
*
* @return The iterative data stream created.
*/
public IterativeStream<T> iterate() {
return new IterativeStream<>(this, 0);
}
/**
* Initiates an iterative part of the program that feeds back data streams.
* The iterative part needs to be closed by calling
* {@link IterativeStream#closeWith(DataStream)}. The transformation of
* this IterativeStream will be the iteration head. The data stream
* given to the {@link IterativeStream#closeWith(DataStream)} method is
* the data stream that will be fed back and used as the input for the
* iteration head. The user can also use a different feedback type than the
* input of the iteration and treat the input and feedback streams as a
* {@link ConnectedStreams} by calling
* {@link IterativeStream#withFeedbackType(TypeInformation)}.
*
* A common usage pattern for streaming iterations is to use output
* splitting to send a part of the closing data stream to the head. Refer to
* {@link #split(OutputSelector)} for more information.
*
* The iteration edge will be partitioned the same way as the first input of
* the iteration head unless it is changed in the
* {@link IterativeStream#closeWith(DataStream)} call.
*
* By default a DataStream with iteration will never terminate, but the user
* can use the maxWaitTime parameter to set a max waiting time for the
* iteration head. If no data is received within the set time, the stream
* terminates.
*
* @param maxWaitTimeMillis
* Number of milliseconds to wait between inputs before shutting
* down
*
* @return The iterative data stream created.
*/
public IterativeStream<T> iterate(long maxWaitTimeMillis) {
return new IterativeStream<>(this, maxWaitTimeMillis);
}
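/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<Long> named "someIntegers"; values are decremented in the loop and
* intermediate values that are still positive are fed back into the iteration:
*
*   IterativeStream<Long> iteration = someIntegers.iterate(5000);
*   DataStream<Long> minusOne = iteration.map(new MapFunction<Long, Long>() {
*       @Override
*       public Long map(Long value) { return value - 1; }
*   });
*   DataStream<Long> stillGreaterThanZero = minusOne.filter(new FilterFunction<Long>() {
*       @Override
*       public boolean filter(Long value) { return value > 0; }
*   });
*   iteration.closeWith(stillGreaterThanZero);
*/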
/**
* Applies a Map transformation on a {@link DataStream}. The transformation
* calls a {@link MapFunction} for each element of the DataStream. Each
* MapFunction call returns exactly one element. The user can also extend
* {@link RichMapFunction} to gain access to other features provided by the
* {@link org.apache.flink.api.common.functions.RichFunction} interface.
*
* @param mapper
* The MapFunction that is called for each element of the
* DataStream.
* @param <R>
* output type
* @return The transformed {@link DataStream}.
*/
public <R> SingleOutputStreamOperator<R, ?> map(MapFunction<T, R> mapper) {
TypeInformation<R> outType = TypeExtractor.getMapReturnTypes(clean(mapper), getType(),
Utils.getCallLocationName(), true);
return transform("Map", outType, new StreamMap<>(clean(mapper)));
}
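/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<String> named "text":
*
*   DataStream<Integer> lengths = text.map(new MapFunction<String, Integer>() {
*       @Override
*       public Integer map(String value) {
*           return value.length();
*       }
*   });
*/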
/**
* Applies a FlatMap transformation on a {@link DataStream}. The
* transformation calls a {@link FlatMapFunction} for each element of the
* DataStream. Each FlatMapFunction call can return any number of elements
* including none. The user can also extend {@link RichFlatMapFunction} to
* gain access to other features provided by the
* {@link org.apache.flink.api.common.functions.RichFunction} interface.
*
* @param flatMapper
* The FlatMapFunction that is called for each element of the
* DataStream
*
* @param <R>
* output type
* @return The transformed {@link DataStream}.
*/
public <R> SingleOutputStreamOperator<R, ?> flatMap(FlatMapFunction<T, R> flatMapper) {
TypeInformation<R> outType = TypeExtractor.getFlatMapReturnTypes(clean(flatMapper),
getType(), Utils.getCallLocationName(), true);
return transform("Flat Map", outType, new StreamFlatMap<>(clean(flatMapper)));
}
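/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<String> named "lines" and uses org.apache.flink.util.Collector:
*
*   DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
*       @Override
*       public void flatMap(String line, Collector<String> out) {
*           for (String word : line.split(" ")) {
*               out.collect(word);
*           }
*       }
*   });
*/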
/**
* Applies a Filter transformation on a {@link DataStream}. The
* transformation calls a {@link FilterFunction} for each element of the
* DataStream and retains only those elements for which the function returns
* true. Elements for which the function returns false are filtered. The
* user can also extend {@link RichFilterFunction} to gain access to other
* features provided by the
* {@link org.apache.flink.api.common.functions.RichFunction} interface.
*
* @param filter
* The FilterFunction that is called for each element of the
* DataStream.
* @return The filtered DataStream.
*/
public SingleOutputStreamOperator<T, ?> filter(FilterFunction<T> filter) {
return transform("Filter", getType(), new StreamFilter<>(clean(filter)));
}
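/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<String> named "lines":
*
*   DataStream<String> nonEmpty = lines.filter(new FilterFunction<String>() {
*       @Override
*       public boolean filter(String value) {
*           return !value.isEmpty();
*       }
*   });
*/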
/**
* Initiates a Project transformation on a {@link Tuple} {@link DataStream}.
* Note: Only Tuple DataStreams can be projected.
*
*
* The transformation projects each Tuple of the DataStream onto a (sub)set of
* fields.
*
* @param fieldIndexes
* The field indexes of the input tuples that are retained. The
* order of fields in the output tuple corresponds to the order
* of field indexes.
* @return The projected DataStream
*
* @see Tuple
* @see DataStream
*/
public <R extends Tuple> SingleOutputStreamOperator<R, ?> project(int... fieldIndexes) {
return new StreamProjection<>(this, fieldIndexes).projectTupleX();
}
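/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<Tuple3<String, Integer, Double>> named "triples"; fields 0 and 2
* are kept, yielding Tuple2<String, Double> elements:
*
*   DataStream<Tuple2<String, Double>> projected = triples.project(0, 2);
*/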
/**
* Creates a co-group operation. See {@link CoGroupedStreams} for an example of how the keys
* and window can be specified.
*/
public <T2> CoGroupedStreams<T, T2> coGroup(DataStream<T2> otherStream) {
return new CoGroupedStreams<>(this, otherStream);
}
/**
* Creates a join operation. See {@link JoinedStreams} for an example of how the keys
* and window can be specified.
*/
public <T2> JoinedStreams<T, T2> join(DataStream<T2> otherStream) {
return new JoinedStreams<>(this, otherStream);
}
/**
* Windows this {@code DataStream} into tumbling time windows.
*
*
* This is a shortcut for either {@code .windowAll(TumblingTimeWindows.of(size))} or
* {@code .windowAll(TumblingProcessingTimeWindows.of(size))} depending on the time characteristic
* set using
* {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic)}.
*
*
* Note: This operation can be inherently non-parallel since all elements have to pass through
* the same operator instance. (Only for special cases, such as aligned time windows is
* it possible to perform this operation in parallel).
*
* @param size The size of the window.
*/
public AllWindowedStream<T, TimeWindow> timeWindowAll(AbstractTime size) {
return windowAll(TumblingTimeWindows.of(size));
}
/**
* Windows this {@code DataStream} into sliding time windows.
*
*
* This is a shortcut for either {@code .windowAll(SlidingTimeWindows.of(size, slide))} or
* {@code .windowAll(SlidingProcessingTimeWindows.of(size, slide))} depending on the time characteristic
* set using
* {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic)}
*
*
* Note: This operation can be inherently non-parallel since all elements have to pass through
* the same operator instance. (Only for special cases, such as aligned time windows is
* it possible to perform this operation in parallel).
*
* @param size The size of the window.
*/
public AllWindowedStream<T, TimeWindow> timeWindowAll(AbstractTime size, AbstractTime slide) {
return windowAll(SlidingTimeWindows.of(size, slide));
}
/**
* Windows this {@code DataStream} into tumbling count windows.
*
*
* Note: This operation can be inherently non-parallel since all elements have to pass through
* the same operator instance. (Only for special cases, such as aligned time windows is
* it possible to perform this operation in parallel).
*
* @param size The size of the windows in number of elements.
*/
public AllWindowedStream<T, GlobalWindow> countWindowAll(long size) {
return windowAll(GlobalWindows.create()).trigger(PurgingTrigger.of(CountTrigger.of(size)));
}
/**
* Windows this {@code DataStream} into sliding count windows.
*
*
* Note: This operation can be inherently non-parallel since all elements have to pass through
* the same operator instance. (Only for special cases, such as aligned time windows is
* it possible to perform this operation in parallel).
*
* @param size The size of the windows in number of elements.
* @param slide The slide interval in number of elements.
*/
public AllWindowedStream<T, GlobalWindow> countWindowAll(long size, long slide) {
return windowAll(GlobalWindows.create())
.evictor(CountEvictor.of(size))
.trigger(CountTrigger.of(slide));
}
/**
* Windows this data stream to an {@code AllWindowedStream}, which evaluates windows
* over the entire, non-keyed stream. Elements are put into windows by a
* {@link org.apache.flink.streaming.api.windowing.assigners.WindowAssigner}. The grouping of
* elements is done per window.
*
*
* A {@link org.apache.flink.streaming.api.windowing.triggers.Trigger} can be defined to specify
* when windows are evaluated. However, {@code WindowAssigners} have a default {@code Trigger}
* that is used if a {@code Trigger} is not specified.
*
*
* Note: This operation can be inherently non-parallel since all elements have to pass through
* the same operator instance. (Only for special cases, such as aligned time windows is
* it possible to perform this operation in parallel).
*
* @param assigner The {@code WindowAssigner} that assigns elements to windows.
* @return The trigger windows data stream.
*/
public <W extends Window> AllWindowedStream<T, W> windowAll(WindowAssigner<? super T, W> assigner) {
return new AllWindowedStream<>(this, assigner);
}
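/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<String> named "stream", java.util.concurrent.TimeUnit, and the Time
* helper from org.apache.flink.streaming.api.windowing.time (an assumption about
* this version's time helper class):
*
*   AllWindowedStream<String, TimeWindow> tumbling =
*       stream.timeWindowAll(Time.of(10, TimeUnit.SECONDS));
*   AllWindowedStream<String, TimeWindow> sliding =
*       stream.timeWindowAll(Time.of(10, TimeUnit.SECONDS), Time.of(5, TimeUnit.SECONDS));
*   AllWindowedStream<String, GlobalWindow> counting = stream.countWindowAll(100);
*/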
/**
* Extracts a timestamp from an element and assigns it as the internal timestamp of that element.
* The internal timestamps are, for example, used for event-time window operations.
*
*
* If you know that the timestamps are strictly increasing you can use an
* {@link org.apache.flink.streaming.api.functions.AscendingTimestampExtractor}. Otherwise,
* you should provide a {@link TimestampExtractor} that also implements
* {@link TimestampExtractor#getCurrentWatermark()} to keep track of watermarks.
*
* @see org.apache.flink.streaming.api.watermark.Watermark
*
* @param extractor The TimestampExtractor that is called for each element of the DataStream.
*/
public SingleOutputStreamOperator<T, ?> assignTimestamps(TimestampExtractor<T> extractor) {
// match parallelism to input, otherwise dop=1 sources could lead to some strange
// behaviour: the watermark will creep along very slowly because the elements
// from the source go to each extraction operator round robin.
int inputParallelism = getTransformation().getParallelism();
ExtractTimestampsOperator<T> operator = new ExtractTimestampsOperator<>(clean(extractor));
return transform("ExtractTimestamps", getTransformation().getOutputType(), operator)
.setParallelism(inputParallelism);
}
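/*
* Usage sketch (illustrative, not part of the original source). "MyEvent",
* "events", and "MyTimestampExtractor" are hypothetical names; the extractor is
* a user-supplied implementation of TimestampExtractor<MyEvent>:
*
*   DataStream<MyEvent> withTimestamps = events.assignTimestamps(new MyTimestampExtractor());
*/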
/**
* Writes a DataStream to the standard output stream (stdout).
*
*
* For each element of the DataStream the result of
* {@link Object#toString()} is written.
*
* @return The closed DataStream.
*/
public DataStreamSink<T> print() {
PrintSinkFunction<T> printFunction = new PrintSinkFunction<>();
return addSink(printFunction);
}
/**
* Writes a DataStream to the standard error stream (stderr).
*
*
* For each element of the DataStream the result of
* {@link Object#toString()} is written.
*
* @return The closed DataStream.
*/
public DataStreamSink<T> printToErr() {
PrintSinkFunction<T> printFunction = new PrintSinkFunction<>(true);
return addSink(printFunction);
}
/**
* Writes a DataStream to the file specified by path in text format.
*
*
* For every element of the DataStream the result of {@link Object#toString()}
* is written.
*
* @param path
* the path pointing to the location the text file is written to
*
* @return the closed DataStream.
*/
public DataStreamSink<T> writeAsText(String path) {
return write(new TextOutputFormat<T>(new Path(path)), 0L);
}
/**
* Writes a DataStream to the file specified by path in text format. The
* writing is performed periodically, in every millis milliseconds.
*
*
* For every element of the DataStream the result of {@link Object#toString()}
* is written.
*
* @param path
* the path pointing to the location the text file is written to
* @param millis
* the file update frequency
*
* @return the closed DataStream
*/
public DataStreamSink<T> writeAsText(String path, long millis) {
TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
return write(tof, millis);
}
/**
* Writes a DataStream to the file specified by path in text format.
*
*
* For every element of the DataStream the result of {@link Object#toString()}
* is written.
*
* @param path
* the path pointing to the location the text file is written to
* @param writeMode
* Control the behavior for existing files. Options are
* NO_OVERWRITE and OVERWRITE.
*
* @return the closed DataStream.
*/
public DataStreamSink<T> writeAsText(String path, WriteMode writeMode) {
TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
tof.setWriteMode(writeMode);
return write(tof, 0L);
}
/**
* Writes a DataStream to the file specified by path in text format.
*
*
* For every element of the DataStream the result of {@link Object#toString()}
* is written.
*
* @param path
* the path pointing to the location the text file is written to
* @param writeMode
* Controls the behavior for existing files. Options are
* NO_OVERWRITE and OVERWRITE.
* @param millis
* the file update frequency
*
* @return the closed DataStream.
*/
public DataStreamSink<T> writeAsText(String path, WriteMode writeMode, long millis) {
TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
tof.setWriteMode(writeMode);
return write(tof, millis);
}
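/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<String> named "lines"; the target path is an example only:
*
*   lines.print();                                                  // each element to stdout
*   lines.writeAsText("file:///tmp/output", WriteMode.OVERWRITE);   // replace existing files
*/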
/**
* Writes a DataStream to the file specified by path in csv format.
*
*
* For every field of an element of the DataStream the result of {@link Object#toString()}
* is written. This method can only be used on data streams of tuples.
*
* @param path
* the path pointing to the location the text file is written to
*
* @return the closed DataStream
*/
@SuppressWarnings("unchecked")
public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path) {
Preconditions.checkArgument(getType().isTupleType(),
"The writeAsCsv() method can only be used on data streams of tuples.");
CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
return write((OutputFormat<T>) of, 0L);
}
/**
* Writes a DataStream to the file specified by path in csv format. The
* writing is performed periodically, in every millis milliseconds.
*
*
* For every field of an element of the DataStream the result of {@link Object#toString()}
* is written. This method can only be used on data streams of tuples.
*
* @param path
* the path pointing to the location the text file is written to
* @param millis
* the file update frequency
*
* @return the closed DataStream
*/
@SuppressWarnings("unchecked")
public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path, long millis) {
Preconditions.checkArgument(getType().isTupleType(),
"The writeAsCsv() method can only be used on data streams of tuples.");
CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
return write((OutputFormat<T>) of, millis);
}
/**
* Writes a DataStream to the file specified by path in csv format.
*
*
* For every field of an element of the DataStream the result of {@link Object#toString()}
* is written. This method can only be used on data streams of tuples.
*
* @param path
* the path pointing to the location the text file is written to
* @param writeMode
* Controls the behavior for existing files. Options are
* NO_OVERWRITE and OVERWRITE.
*
* @return the closed DataStream
*/
@SuppressWarnings("unchecked")
public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path, WriteMode writeMode) {
Preconditions.checkArgument(getType().isTupleType(),
"The writeAsCsv() method can only be used on data streams of tuples.");
CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
if (writeMode != null) {
of.setWriteMode(writeMode);
}
return write((OutputFormat<T>) of, 0L);
}
/**
* Writes a DataStream to the file specified by path in csv format. The
* writing is performed periodically, in every millis milliseconds.
*
*
* For every field of an element of the DataStream the result of {@link Object#toString()}
* is written. This method can only be used on data streams of tuples.
*
* @param path
* the path pointing to the location the text file is written to
* @param writeMode
* Controls the behavior for existing files. Options are
* NO_OVERWRITE and OVERWRITE.
* @param millis
* the file update frequency
*
* @return the closed DataStream
*/
@SuppressWarnings("unchecked")
public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path, WriteMode writeMode,
long millis) {
Preconditions.checkArgument(getType().isTupleType(),
"The writeAsCsv() method can only be used on data streams of tuples.");
CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
if (writeMode != null) {
of.setWriteMode(writeMode);
}
return write((OutputFormat<T>) of, millis);
}
/**
* Writes the DataStream to a socket as a byte array. The format of the
* output is specified by a {@link SerializationSchema}.
*
* @param hostName
* host of the socket
* @param port
* port of the socket
* @param schema
* schema for serialization
* @return the closed DataStream
*/
public DataStreamSink<T> writeToSocket(String hostName, int port, SerializationSchema<T, byte[]> schema) {
DataStreamSink<T> returnStream = addSink(new SocketClientSink<>(hostName, port, schema, 0));
returnStream.setParallelism(1); // It would not work if multiple instances would connect to the same port
return returnStream;
}
/**
* Writes the DataStream to an output, described by an OutputFormat.
*
* @param format The output format
* @param millis the write frequency
* @return The closed DataStream
*/
public DataStreamSink<T> write(OutputFormat<T> format, long millis) {
return addSink(new FileSinkFunctionByMillis<>(format, millis));
}
/**
* Method for passing user defined operators along with the type
* information that will transform the DataStream.
*
* @param operatorName
* name of the operator, for logging purposes
* @param outTypeInfo
* the output type of the operator
* @param operator
* the object containing the transformation logic
* @param <R>
* type of the return stream
* @return the data stream constructed
*/
public <R> SingleOutputStreamOperator<R, ?> transform(String operatorName, TypeInformation<R> outTypeInfo, OneInputStreamOperator<T, R> operator) {
// read the output type of the input Transform to coax out errors about MissingTypeInfo
transformation.getOutputType();
OneInputTransformation<T, R> resultTransform = new OneInputTransformation<>(
this.transformation,
operatorName,
operator,
outTypeInfo,
environment.getParallelism());
@SuppressWarnings({ "unchecked", "rawtypes" })
SingleOutputStreamOperator<R, ?> returnStream = new SingleOutputStreamOperator(environment, resultTransform);
getExecutionEnvironment().addOperator(resultTransform);
return returnStream;
}
/**
* Internal function for setting the partitioner for the DataStream
*
* @param partitioner
* Partitioner to set.
* @return The modified DataStream.
*/
protected DataStream<T> setConnectionType(StreamPartitioner<T> partitioner) {
return new DataStream<>(this.getExecutionEnvironment(), new PartitionTransformation<>(this.getTransformation(), partitioner));
}
/**
* Adds the given sink to this DataStream. Only streams with sinks added
* will be executed once the {@link StreamExecutionEnvironment#execute()}
* method is called.
*
* @param sinkFunction
* The object containing the sink's invoke function.
* @return The closed DataStream.
*/
public DataStreamSink<T> addSink(SinkFunction<T> sinkFunction) {
// read the output type of the input Transform to coax out errors about MissingTypeInfo
transformation.getOutputType();
// configure the type if needed
if (sinkFunction instanceof InputTypeConfigurable) {
((InputTypeConfigurable) sinkFunction).setInputType(getType(), getExecutionConfig());
}
StreamSink<T> sinkOperator = new StreamSink<>(clean(sinkFunction));
DataStreamSink<T> sink = new DataStreamSink<>(this, sinkOperator);
getExecutionEnvironment().addOperator(sink.getTransformation());
return sink;
}
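/*
* Usage sketch (illustrative, not part of the original source). Assumes a
* DataStream<String> named "lines":
*
*   lines.addSink(new SinkFunction<String>() {
*       @Override
*       public void invoke(String value) {
*           System.out.println(value);
*       }
*   });
*/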
/**
* Returns the {@link StreamTransformation} that represents the operation that logically creates
* this {@link DataStream}.
*
* @return The Transformation
*/
public StreamTransformation<T> getTransformation() {
return transformation;
}
}