
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.datastream;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.io.OutputFormat;
import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.Utils;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.io.CsvOutputFormat;
import org.apache.flink.api.java.io.TextOutputFormat;
import org.apache.flink.api.java.operators.Keys;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.TimestampExtractor;
import org.apache.flink.streaming.api.functions.sink.FileSinkFunctionByMillis;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.sink.SocketClientSink;
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
import org.apache.flink.streaming.api.operators.StreamFilter;
import org.apache.flink.streaming.api.operators.StreamFlatMap;
import org.apache.flink.streaming.api.operators.StreamMap;
import org.apache.flink.streaming.api.operators.StreamSink;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.transformations.PartitionTransformation;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.api.transformations.UnionTransformation;
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.CountEvictor;
import org.apache.flink.streaming.api.windowing.time.AbstractTime;
import org.apache.flink.streaming.api.windowing.triggers.CountTrigger;
import org.apache.flink.streaming.api.windowing.triggers.PurgingTrigger;
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.streaming.runtime.operators.ExtractTimestampsOperator;
import org.apache.flink.streaming.runtime.partitioner.BroadcastPartitioner;
import org.apache.flink.streaming.runtime.partitioner.CustomPartitionerWrapper;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RebalancePartitioner;
import org.apache.flink.streaming.runtime.partitioner.HashPartitioner;
import org.apache.flink.streaming.runtime.partitioner.GlobalPartitioner;
import org.apache.flink.streaming.runtime.partitioner.ShufflePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.util.keys.KeySelectorUtil;
import org.apache.flink.streaming.util.serialization.SerializationSchema;

import com.google.common.base.Preconditions;

/**
 * A DataStream represents a stream of elements of the same type. A DataStream
 * can be transformed into another DataStream by applying a transformation as
 * for example:
 * <ul>
 *   <li>{@link DataStream#map},</li>
 *   <li>{@link DataStream#filter}, or</li>
 * </ul>
 *
 * @param <T> The type of the elements in this stream
 */
public class DataStream<T> {

    protected final StreamExecutionEnvironment environment;

    protected final StreamTransformation<T> transformation;

    /**
     * Creates a new {@link DataStream} in the given execution environment with
     * partitioning set to forward by default.
     *
     * @param environment The StreamExecutionEnvironment
     * @param transformation The StreamTransformation that represents this DataStream
     */
    public DataStream(StreamExecutionEnvironment environment, StreamTransformation<T> transformation) {
        this.environment = Preconditions.checkNotNull(environment, "Execution Environment must not be null.");
        this.transformation = Preconditions.checkNotNull(transformation, "Stream Transformation must not be null.");
    }

    /**
     * Returns the ID of the {@link DataStream} in the current {@link StreamExecutionEnvironment}.
     *
     * @return ID of the DataStream
     */
    public Integer getId() {
        return transformation.getId();
    }

    /**
     * Gets the parallelism for this operator.
     *
     * @return The parallelism set for this operator.
     */
    public int getParallelism() {
        return transformation.getParallelism();
    }

    /**
     * Gets the type of the stream.
     *
     * @return The type of the DataStream.
     */
    public TypeInformation<T> getType() {
        return transformation.getOutputType();
    }

    /**
     * Invokes the {@link org.apache.flink.api.java.ClosureCleaner} on the given function
     * if closure cleaning is enabled in the {@link ExecutionConfig}.
     *
     * @return The cleaned function
     */
    protected <F> F clean(F f) {
        return getExecutionEnvironment().clean(f);
    }

    /**
     * Returns the {@link StreamExecutionEnvironment} that was used to create this
     * {@link DataStream}.
     *
     * @return The execution environment
     */
    public StreamExecutionEnvironment getExecutionEnvironment() {
        return environment;
    }

    public ExecutionConfig getExecutionConfig() {
        return environment.getConfig();
    }

    /**
     * Creates a new {@link DataStream} by merging {@link DataStream} outputs of
     * the same type with each other. The DataStreams merged using this operator
     * will be transformed simultaneously.
     *
     * @param streams The DataStreams to union with this stream.
     * @return The unioned {@link DataStream}.
     */
    @SafeVarargs
    public final DataStream<T> union(DataStream<T>... streams) {
        List<StreamTransformation<T>> unionedTransforms = new ArrayList<>();
        unionedTransforms.add(this.transformation);

        Collection<StreamTransformation<?>> thisPredecessors = this.getTransformation().getTransitivePredecessors();

        for (DataStream<T> newStream : streams) {
            if (newStream.getParallelism() != this.getParallelism()) {
                throw new IllegalArgumentException(
                        "DataStream can only be unioned with DataStreams of the same parallelism. " +
                                "This stream: " + this.getTransformation() +
                                ", other stream: " + newStream.getTransformation());
            }
            if (!getType().equals(newStream.getType())) {
                throw new IllegalArgumentException("Cannot union streams of different types: "
                        + getType() + " and " + newStream.getType());
            }

            Collection<StreamTransformation<?>> predecessors = newStream.getTransformation().getTransitivePredecessors();

            if (predecessors.contains(this.transformation) || thisPredecessors.contains(newStream.getTransformation())) {
                throw new UnsupportedOperationException("A DataStream cannot be unioned with itself.");
            }
            unionedTransforms.add(newStream.getTransformation());
        }
        return new DataStream<>(this.environment, new UnionTransformation<>(unionedTransforms));
    }
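    // Illustrative usage sketch (not part of the original source): unioning two streams of the
    // same element type and parallelism. The variable names and the surrounding environment
    // ("env") are assumed for the example only.
    //
    //   DataStream<String> first  = env.fromElements("a", "b");
    //   DataStream<String> second = env.fromElements("c", "d");
    //   DataStream<String> all    = first.union(second);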
    /**
     * Operator used for directing tuples to specific named outputs using an
     * {@link org.apache.flink.streaming.api.collector.selector.OutputSelector}.
     * Calling this method on an operator creates a new {@link SplitStream}.
     *
     * @param outputSelector The user-defined
     *            {@link org.apache.flink.streaming.api.collector.selector.OutputSelector}
     *            for directing the tuples.
     * @return The {@link SplitStream}
     */
    public SplitStream<T> split(OutputSelector<T> outputSelector) {
        return new SplitStream<>(this, clean(outputSelector));
    }

    /**
     * Creates a new {@link ConnectedStreams} by connecting {@link DataStream} outputs of
     * (possibly) different types with each other. The DataStreams connected using this
     * operator can be used with CoFunctions to apply joint transformations.
     *
     * @param dataStream The DataStream with which this stream will be connected.
     * @return The {@link ConnectedStreams}.
     */
    public <R> ConnectedStreams<T, R> connect(DataStream<R> dataStream) {
        return new ConnectedStreams<>(environment, this, dataStream);
    }

    /**
     * Creates a new {@link KeyedStream} that uses the provided key for partitioning
     * its operator states.
     *
     * @param key The KeySelector to be used for extracting the key for partitioning
     * @return The {@link DataStream} with partitioned state (i.e. a KeyedStream)
     */
    public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) {
        return new KeyedStream<>(this, clean(key));
    }

    /**
     * Partitions the operator state of a {@link DataStream} by the given key positions.
     *
     * @param fields The positions of the fields on which the {@link DataStream} will be grouped.
     * @return The {@link DataStream} with partitioned state (i.e. a KeyedStream)
     */
    public KeyedStream<T, Tuple> keyBy(int... fields) {
        if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) {
            return keyBy(KeySelectorUtil.getSelectorForArray(fields, getType()));
        } else {
            return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
        }
    }

    /**
     * Partitions the operator state of a {@link DataStream} using field expressions.
     * A field expression is either the name of a public field or a getter method with
     * parentheses of the {@link DataStream}'s underlying type. A dot can be used to drill
     * down into objects, as in {@code "field1.getInnerField2()"}.
     *
     * @param fields One or more field expressions on which the state of the
     *            {@link DataStream} operators will be partitioned.
     * @return The {@link DataStream} with partitioned state (i.e. a KeyedStream)
     */
    public KeyedStream<T, Tuple> keyBy(String... fields) {
        return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
    }

    private KeyedStream<T, Tuple> keyBy(Keys<T> keys) {
        return new KeyedStream<>(this, clean(KeySelectorUtil.getSelectorForKeys(keys,
                getType(), getExecutionConfig())));
    }
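    // Illustrative usage sketch (not part of the original source): keying a stream of
    // (word, count) tuples by the word. The stream variable and its contents are assumed
    // for the example only.
    //
    //   DataStream<Tuple2<String, Integer>> counts = ...;
    //   KeyedStream<Tuple2<String, Integer>, Tuple> keyed  = counts.keyBy(0);     // by position
    //   KeyedStream<Tuple2<String, Integer>, Tuple> byName = counts.keyBy("f0");  // by field expression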
    /**
     * Sets the partitioning of the {@link DataStream} so that the output is partitioned by
     * hashing on the given fields. This setting only affects how the outputs will be
     * distributed between the parallel instances of the next processing operator.
     *
     * @param fields The tuple fields that should be used for partitioning
     * @return The partitioned DataStream
     */
    public DataStream<T> partitionByHash(int... fields) {
        if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) {
            return partitionByHash(KeySelectorUtil.getSelectorForArray(fields, getType()));
        } else {
            return partitionByHash(new Keys.ExpressionKeys<>(fields, getType()));
        }
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output is partitioned by
     * hashing on the given fields. This setting only affects how the outputs will be
     * distributed between the parallel instances of the next processing operator.
     *
     * @param fields The field expressions that should be used for partitioning
     * @return The partitioned DataStream
     */
    public DataStream<T> partitionByHash(String... fields) {
        return partitionByHash(new Keys.ExpressionKeys<>(fields, getType()));
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output is partitioned using
     * the given {@link KeySelector}. This setting only affects how the outputs will be
     * distributed between the parallel instances of the next processing operator.
     *
     * @param keySelector The function that extracts the key from an element in the stream
     * @return The partitioned DataStream
     */
    public DataStream<T> partitionByHash(KeySelector<T, ?> keySelector) {
        return setConnectionType(new HashPartitioner<>(clean(keySelector)));
    }

    // private helper method for partitioning
    private DataStream<T> partitionByHash(Keys<T> keys) {
        KeySelector<T, ?> keySelector = clean(KeySelectorUtil.getSelectorForKeys(
                keys, getType(), getExecutionConfig()));

        return setConnectionType(new HashPartitioner<>(keySelector));
    }

    /**
     * Partitions a tuple DataStream on the specified key fields using a custom partitioner.
     * This method takes the key position to partition on, and a partitioner that accepts the
     * key type.
     * <p>
     * Note: This method works only on single field keys.
     *
     * @param partitioner The partitioner to assign partitions to keys.
     * @param field The field index on which the DataStream is to be partitioned.
     * @return The partitioned DataStream.
     */
    public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, int field) {
        Keys.ExpressionKeys<T> outExpressionKeys = new Keys.ExpressionKeys<>(new int[]{field}, getType());
        return partitionCustom(partitioner, outExpressionKeys);
    }

    /**
     * Partitions a POJO DataStream on the specified key fields using a custom partitioner.
     * This method takes the key expression to partition on, and a partitioner that accepts the
     * key type.
     * <p>
     * Note: This method works only on single field keys.
     *
     * @param partitioner The partitioner to assign partitions to keys.
     * @param field The field expression on which the DataStream is to be partitioned.
     * @return The partitioned DataStream.
     */
    public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, String field) {
        Keys.ExpressionKeys<T> outExpressionKeys = new Keys.ExpressionKeys<>(new String[]{field}, getType());
        return partitionCustom(partitioner, outExpressionKeys);
    }

    /**
     * Partitions a DataStream on the key returned by the selector, using a custom partitioner.
     * This method takes the key selector to get the key to partition on, and a partitioner
     * that accepts the key type.
     * <p>
     * Note: This method works only on single field keys, i.e. the selector cannot return
     * tuples of fields.
     *
     * @param partitioner The partitioner to assign partitions to keys.
     * @param keySelector The KeySelector with which the DataStream is partitioned.
     * @return The partitioned DataStream.
     * @see KeySelector
     */
    public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, KeySelector<T, K> keySelector) {
        return setConnectionType(new CustomPartitionerWrapper<>(clean(partitioner),
                clean(keySelector)));
    }

    // private helper method for custom partitioning
    private <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, Keys<T> keys) {
        KeySelector<T, K> keySelector = KeySelectorUtil.getSelectorForOneKey(keys, partitioner, getType(), getExecutionConfig());

        return setConnectionType(
                new CustomPartitionerWrapper<>(
                        clean(partitioner),
                        clean(keySelector)));
    }
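    // Illustrative usage sketch (not part of the original source): routing elements to a
    // partition chosen by a user-defined Partitioner applied to the first tuple field. The
    // stream variable is assumed for the example only.
    //
    //   DataStream<Tuple2<String, Integer>> counts = ...;
    //   DataStream<Tuple2<String, Integer>> custom = counts.partitionCustom(
    //       new Partitioner<String>() {
    //           @Override
    //           public int partition(String key, int numPartitions) {
    //               return Math.abs(key.hashCode()) % numPartitions;
    //           }
    //       }, 0);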
    /**
     * Sets the partitioning of the {@link DataStream} so that the output tuples are broadcast
     * to every parallel instance of the next component.
     * <p>
     * This setting only affects how the outputs will be distributed between the parallel
     * instances of the next processing operator.
     *
     * @return The DataStream with broadcast partitioning set.
     */
    public DataStream<T> broadcast() {
        return setConnectionType(new BroadcastPartitioner<T>());
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output tuples are shuffled
     * uniformly at random to the next component.
     * <p>
     * This setting only affects how the outputs will be distributed between the parallel
     * instances of the next processing operator.
     *
     * @return The DataStream with shuffle partitioning set.
     */
    public DataStream<T> shuffle() {
        return setConnectionType(new ShufflePartitioner<T>());
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output tuples are forwarded
     * to the local subtask of the next component (whenever possible).
     * <p>
     * This setting only affects how the outputs will be distributed between the parallel
     * instances of the next processing operator.
     *
     * @return The DataStream with forward partitioning set.
     */
    public DataStream<T> forward() {
        return setConnectionType(new ForwardPartitioner<T>());
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output tuples are distributed
     * evenly to instances of the next component in a round-robin fashion.
     * <p>
     * This setting only affects how the outputs will be distributed between the parallel
     * instances of the next processing operator.
     *
     * @return The DataStream with rebalance partitioning set.
     */
    public DataStream<T> rebalance() {
        return setConnectionType(new RebalancePartitioner<T>());
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output values all go to the
     * first instance of the next processing operator. Use this setting with care, since it
     * might cause a serious performance bottleneck in the application.
     *
     * @return The DataStream with global partitioning set.
     */
    public DataStream<T> global() {
        return setConnectionType(new GlobalPartitioner<T>());
    }
    /**
     * Initiates an iterative part of the program that feeds back data streams. The iterative
     * part needs to be closed by calling {@link IterativeStream#closeWith(DataStream)}. The
     * transformation of this IterativeStream will be the iteration head. The data stream given
     * to the {@link IterativeStream#closeWith(DataStream)} method is the data stream that will
     * be fed back and used as the input for the iteration head. The user can also use a
     * different feedback type than the input of the iteration and treat the input and feedback
     * streams as a {@link ConnectedStreams} by calling
     * {@link IterativeStream#withFeedbackType(TypeInformation)}.
     * <p>
     * A common usage pattern for streaming iterations is to use output splitting to send a
     * part of the closing data stream to the head. Refer to {@link #split(OutputSelector)}
     * for more information.
     * <p>
     * The iteration edge will be partitioned the same way as the first input of the iteration
     * head unless it is changed in the {@link IterativeStream#closeWith(DataStream)} call.
     * <p>
     * By default a DataStream with iteration will never terminate, but the user can use the
     * maxWaitTime parameter of {@link #iterate(long)} to set a max waiting time for the
     * iteration head. If no data is received within the set time, the stream terminates.
     *
     * @return The iterative data stream created.
     */
    public IterativeStream<T> iterate() {
        return new IterativeStream<>(this, 0);
    }

    /**
     * Initiates an iterative part of the program that feeds back data streams. The iterative
     * part needs to be closed by calling {@link IterativeStream#closeWith(DataStream)}. The
     * transformation of this IterativeStream will be the iteration head. The data stream given
     * to the {@link IterativeStream#closeWith(DataStream)} method is the data stream that will
     * be fed back and used as the input for the iteration head. The user can also use a
     * different feedback type than the input of the iteration and treat the input and feedback
     * streams as a {@link ConnectedStreams} by calling
     * {@link IterativeStream#withFeedbackType(TypeInformation)}.
     * <p>
     * A common usage pattern for streaming iterations is to use output splitting to send a
     * part of the closing data stream to the head. Refer to {@link #split(OutputSelector)}
     * for more information.
     * <p>
     * The iteration edge will be partitioned the same way as the first input of the iteration
     * head unless it is changed in the {@link IterativeStream#closeWith(DataStream)} call.
     * <p>
     * By default a DataStream with iteration will never terminate, but the user can use the
     * maxWaitTime parameter to set a max waiting time for the iteration head. If no data is
     * received within the set time, the stream terminates.
     *
     * @param maxWaitTimeMillis Number of milliseconds to wait between inputs before shutting down
     * @return The iterative data stream created.
     */
    public IterativeStream<T> iterate(long maxWaitTimeMillis) {
        return new IterativeStream<>(this, maxWaitTimeMillis);
    }
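    // Illustrative usage sketch (not part of the original source): a simple iteration that keeps
    // decrementing positive numbers and feeds them back until they reach zero. The input stream
    // variable is assumed for the example only.
    //
    //   IterativeStream<Long> iteration = someLongStream.iterate(5000);
    //   DataStream<Long> minusOne = iteration.map(new MapFunction<Long, Long>() {
    //       @Override
    //       public Long map(Long value) { return value - 1; }
    //   });
    //   DataStream<Long> stillPositive = minusOne.filter(new FilterFunction<Long>() {
    //       @Override
    //       public boolean filter(Long value) { return value > 0; }
    //   });
    //   iteration.closeWith(stillPositive);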
    /**
     * Applies a Map transformation on a {@link DataStream}. The transformation calls a
     * {@link MapFunction} for each element of the DataStream. Each MapFunction call returns
     * exactly one element. The user can also extend {@link RichMapFunction} to gain access to
     * other features provided by the
     * {@link org.apache.flink.api.common.functions.RichFunction} interface.
     *
     * @param mapper The MapFunction that is called for each element of the DataStream.
     * @param <R> The output type.
     * @return The transformed {@link DataStream}.
     */
    public <R> SingleOutputStreamOperator<R, ?> map(MapFunction<T, R> mapper) {
        TypeInformation<R> outType = TypeExtractor.getMapReturnTypes(clean(mapper), getType(),
                Utils.getCallLocationName(), true);

        return transform("Map", outType, new StreamMap<>(clean(mapper)));
    }

    /**
     * Applies a FlatMap transformation on a {@link DataStream}. The transformation calls a
     * {@link FlatMapFunction} for each element of the DataStream. Each FlatMapFunction call can
     * return any number of elements, including none. The user can also extend
     * {@link RichFlatMapFunction} to gain access to other features provided by the
     * {@link org.apache.flink.api.common.functions.RichFunction} interface.
     *
     * @param flatMapper The FlatMapFunction that is called for each element of the DataStream.
     * @param <R> The output type.
     * @return The transformed {@link DataStream}.
     */
    public <R> SingleOutputStreamOperator<R, ?> flatMap(FlatMapFunction<T, R> flatMapper) {
        TypeInformation<R> outType = TypeExtractor.getFlatMapReturnTypes(clean(flatMapper),
                getType(), Utils.getCallLocationName(), true);

        return transform("Flat Map", outType, new StreamFlatMap<>(clean(flatMapper)));
    }

    /**
     * Applies a Filter transformation on a {@link DataStream}. The transformation calls a
     * {@link FilterFunction} for each element of the DataStream and retains only those elements
     * for which the function returns true. Elements for which the function returns false are
     * filtered out. The user can also extend {@link RichFilterFunction} to gain access to other
     * features provided by the {@link org.apache.flink.api.common.functions.RichFunction}
     * interface.
     *
     * @param filter The FilterFunction that is called for each element of the DataStream.
     * @return The filtered DataStream.
     */
    public SingleOutputStreamOperator<T, ?> filter(FilterFunction<T> filter) {
        return transform("Filter", getType(), new StreamFilter<>(clean(filter)));
    }
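    // Illustrative usage sketch (not part of the original source): mapping lines to their length
    // and keeping only non-empty lines. The "lines" stream is assumed for the example only.
    //
    //   DataStream<String> lines = ...;
    //   DataStream<Integer> lengths = lines
    //       .filter(new FilterFunction<String>() {
    //           @Override
    //           public boolean filter(String line) { return !line.isEmpty(); }
    //       })
    //       .map(new MapFunction<String, Integer>() {
    //           @Override
    //           public Integer map(String line) { return line.length(); }
    //       });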
    /**
     * Initiates a Project transformation on a {@link Tuple} {@link DataStream}.<br>
     * Note: Only Tuple DataStreams can be projected.
     * <p>
     * The transformation projects each Tuple of the DataStream onto a (sub)set of fields.
     *
     * @param fieldIndexes The field indexes of the input tuples that are retained. The order of
     *            fields in the output tuple corresponds to the order of field indexes.
     * @return The projected DataStream
     *
     * @see Tuple
     * @see DataStream
     */
    public <R extends Tuple> SingleOutputStreamOperator<R, ?> project(int... fieldIndexes) {
        return new StreamProjection<>(this, fieldIndexes).projectTupleX();
    }

    /**
     * Creates a co-group operation. See {@link CoGroupedStreams} for an example of how the keys
     * and window can be specified.
     */
    public <T2> CoGroupedStreams<T, T2> coGroup(DataStream<T2> otherStream) {
        return new CoGroupedStreams<>(this, otherStream);
    }

    /**
     * Creates a join operation. See {@link JoinedStreams} for an example of how the keys and
     * window can be specified.
     */
    public <T2> JoinedStreams<T, T2> join(DataStream<T2> otherStream) {
        return new JoinedStreams<>(this, otherStream);
    }
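    // Illustrative usage sketch (not part of the original source): keeping only the third and
    // first field of a Tuple3 stream, in that order. The stream variable is assumed for the
    // example only.
    //
    //   DataStream<Tuple3<String, Integer, Double>> input = ...;
    //   DataStream<Tuple2<Double, String>> projected = input.project(2, 0);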
    /**
     * Windows this {@code DataStream} into tumbling time windows.
     * <p>
     * This is a shortcut for either {@code .window(TumblingTimeWindows.of(size))} or
     * {@code .window(TumblingProcessingTimeWindows.of(size))}, depending on the time
     * characteristic set using
     * {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic)}.
     * <p>
     * Note: This operation is inherently non-parallel since all elements have to pass through
     * the same operator instance. (Only for special cases, such as aligned time windows, is it
     * possible to perform this operation in parallel.)
     *
     * @param size The size of the window.
     */
    public AllWindowedStream<T, TimeWindow> timeWindowAll(AbstractTime size) {
        return windowAll(TumblingTimeWindows.of(size));
    }

    /**
     * Windows this {@code DataStream} into sliding time windows.
     * <p>
     * This is a shortcut for either {@code .window(SlidingTimeWindows.of(size, slide))} or
     * {@code .window(SlidingProcessingTimeWindows.of(size, slide))}, depending on the time
     * characteristic set using
     * {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic)}.
     * <p>
     * Note: This operation is inherently non-parallel since all elements have to pass through
     * the same operator instance. (Only for special cases, such as aligned time windows, is it
     * possible to perform this operation in parallel.)
     *
     * @param size The size of the window.
     * @param slide The slide interval of the window.
     */
    public AllWindowedStream<T, TimeWindow> timeWindowAll(AbstractTime size, AbstractTime slide) {
        return windowAll(SlidingTimeWindows.of(size, slide));
    }

    /**
     * Windows this {@code DataStream} into tumbling count windows.
     * <p>
     * Note: This operation is inherently non-parallel since all elements have to pass through
     * the same operator instance. (Only for special cases, such as aligned time windows, is it
     * possible to perform this operation in parallel.)
     *
     * @param size The size of the windows in number of elements.
     */
    public AllWindowedStream<T, GlobalWindow> countWindowAll(long size) {
        return windowAll(GlobalWindows.create()).trigger(PurgingTrigger.of(CountTrigger.of(size)));
    }

    /**
     * Windows this {@code DataStream} into sliding count windows.
     * <p>
     * Note: This operation is inherently non-parallel since all elements have to pass through
     * the same operator instance. (Only for special cases, such as aligned time windows, is it
     * possible to perform this operation in parallel.)
     *
     * @param size The size of the windows in number of elements.
     * @param slide The slide interval in number of elements.
     */
    public AllWindowedStream<T, GlobalWindow> countWindowAll(long size, long slide) {
        return windowAll(GlobalWindows.create())
                .evictor(CountEvictor.of(size))
                .trigger(CountTrigger.of(slide));
    }
    /**
     * Windows this data stream to an {@code AllWindowedStream}, which evaluates windows over a
     * non key grouped stream. Elements are put into windows by a
     * {@link org.apache.flink.streaming.api.windowing.assigners.WindowAssigner}. The grouping
     * of elements is done by window.
     * <p>
     * A {@link org.apache.flink.streaming.api.windowing.triggers.Trigger} can be defined to
     * specify when windows are evaluated. However, {@code WindowAssigners} have a default
     * {@code Trigger} that is used if a {@code Trigger} is not specified.
     * <p>
     * Note: This operation is inherently non-parallel since all elements have to pass through
     * the same operator instance. (Only for special cases, such as aligned time windows, is it
     * possible to perform this operation in parallel.)
     *
     * @param assigner The {@code WindowAssigner} that assigns elements to windows.
     * @return The trigger windows data stream.
     */
    public <W extends Window> AllWindowedStream<T, W> windowAll(WindowAssigner<? super T, W> assigner) {
        return new AllWindowedStream<>(this, assigner);
    }
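    // Illustrative usage sketch (not part of the original source): grouping a non-keyed stream
    // into tumbling five-second windows. The "events" stream and the Time helper usage are
    // assumed for the example only.
    //
    //   DataStream<String> events = ...;
    //   AllWindowedStream<String, TimeWindow> windowed =
    //       events.timeWindowAll(Time.of(5, TimeUnit.SECONDS));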
    /**
     * Extracts a timestamp from an element and assigns it as the internal timestamp of that
     * element. The internal timestamps are, for example, used for event-time window operations.
     * <p>
     * If you know that the timestamps are strictly increasing you can use an
     * {@link org.apache.flink.streaming.api.functions.AscendingTimestampExtractor}. Otherwise,
     * you should provide a {@link TimestampExtractor} that also implements
     * {@link TimestampExtractor#getCurrentWatermark()} to keep track of watermarks.
     *
     * @param extractor The TimestampExtractor that is called for each element of the DataStream.
     *
     * @see org.apache.flink.streaming.api.watermark.Watermark
     */
    public SingleOutputStreamOperator<T, ?> assignTimestamps(TimestampExtractor<T> extractor) {
        // match parallelism to input, otherwise dop=1 sources could lead to some strange
        // behaviour: the watermark will creep along very slowly because the elements
        // from the source go to each extraction operator round robin.
        int inputParallelism = getTransformation().getParallelism();
        ExtractTimestampsOperator<T> operator = new ExtractTimestampsOperator<>(clean(extractor));
        return transform("ExtractTimestamps", getTransformation().getOutputType(), operator)
                .setParallelism(inputParallelism);
    }
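    // Illustrative usage sketch (not part of the original source): assigning event timestamps
    // from a field of the element when the timestamps are known to be ascending. The "events"
    // stream, the MyEvent type, and its getTimestamp() accessor are assumed for the example only.
    //
    //   DataStream<MyEvent> events = ...;
    //   DataStream<MyEvent> withTimestamps = events.assignTimestamps(
    //       new AscendingTimestampExtractor<MyEvent>() {
    //           @Override
    //           public long extractAscendingTimestamp(MyEvent element, long currentTimestamp) {
    //               return element.getTimestamp();
    //           }
    //       });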
    /**
     * Writes a DataStream to the standard output stream (stdout).
     * <p>
     * For each element of the DataStream the result of {@link Object#toString()} is written.
     *
     * @return The closed DataStream.
     */
    public DataStreamSink<T> print() {
        PrintSinkFunction<T> printFunction = new PrintSinkFunction<>();
        return addSink(printFunction);
    }

    /**
     * Writes a DataStream to the standard error stream (stderr).
     * <p>
     * For each element of the DataStream the result of {@link Object#toString()} is written.
     *
     * @return The closed DataStream.
     */
    public DataStreamSink<T> printToErr() {
        PrintSinkFunction<T> printFunction = new PrintSinkFunction<>(true);
        return addSink(printFunction);
    }

    /**
     * Writes a DataStream to the file specified by path in text format.
     * <p>
     * For every element of the DataStream the result of {@link Object#toString()} is written.
     *
     * @param path The path pointing to the location the text file is written to
     * @return The closed DataStream.
     */
    public DataStreamSink<T> writeAsText(String path) {
        return write(new TextOutputFormat<T>(new Path(path)), 0L);
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The writing is
     * performed periodically, every millis milliseconds.
     * <p>
     * For every element of the DataStream the result of {@link Object#toString()} is written.
     *
     * @param path The path pointing to the location the text file is written to
     * @param millis The file update frequency
     * @return The closed DataStream.
     */
    public DataStreamSink<T> writeAsText(String path, long millis) {
        TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
        return write(tof, millis);
    }

    /**
     * Writes a DataStream to the file specified by path in text format.
     * <p>
     * For every element of the DataStream the result of {@link Object#toString()} is written.
     *
     * @param path The path pointing to the location the text file is written to
     * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and
     *            OVERWRITE.
     * @return The closed DataStream.
     */
    public DataStreamSink<T> writeAsText(String path, WriteMode writeMode) {
        TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
        tof.setWriteMode(writeMode);
        return write(tof, 0L);
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The writing is
     * performed periodically, every millis milliseconds.
     * <p>
     * For every element of the DataStream the result of {@link Object#toString()} is written.
     *
     * @param path The path pointing to the location the text file is written to
     * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and
     *            OVERWRITE.
     * @param millis The file update frequency
     * @return The closed DataStream.
     */
    public DataStreamSink<T> writeAsText(String path, WriteMode writeMode, long millis) {
        TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
        tof.setWriteMode(writeMode);
        return write(tof, millis);
    }
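    // Illustrative usage sketch (not part of the original source): writing a stream to a text
    // file, overwriting any existing file and flushing roughly once per second. The stream
    // variable and the output path are assumed for the example only.
    //
    //   DataStream<String> lines = ...;
    //   lines.writeAsText("file:///tmp/output.txt", WriteMode.OVERWRITE, 1000);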
    /**
     * Writes a DataStream to the file specified by path in CSV format.
     * <p>
     * For every field of an element of the DataStream the result of {@link Object#toString()}
     * is written. This method can only be used on data streams of tuples.
     *
     * @param path The path pointing to the location the CSV file is written to
     * @return The closed DataStream.
     */
    @SuppressWarnings("unchecked")
    public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path) {
        Preconditions.checkArgument(getType().isTupleType(),
                "The writeAsCsv() method can only be used on data streams of tuples.");
        CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
                CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
        return write((OutputFormat<T>) of, 0L);
    }

    /**
     * Writes a DataStream to the file specified by path in CSV format. The writing is
     * performed periodically, every millis milliseconds.
     * <p>
     * For every field of an element of the DataStream the result of {@link Object#toString()}
     * is written. This method can only be used on data streams of tuples.
     *
     * @param path The path pointing to the location the CSV file is written to
     * @param millis The file update frequency
     * @return The closed DataStream.
     */
    @SuppressWarnings("unchecked")
    public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path, long millis) {
        Preconditions.checkArgument(getType().isTupleType(),
                "The writeAsCsv() method can only be used on data streams of tuples.");
        CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
                CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
        return write((OutputFormat<T>) of, millis);
    }

    /**
     * Writes a DataStream to the file specified by path in CSV format.
     * <p>
     * For every field of an element of the DataStream the result of {@link Object#toString()}
     * is written. This method can only be used on data streams of tuples.
     *
     * @param path The path pointing to the location the CSV file is written to
     * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and
     *            OVERWRITE.
     * @return The closed DataStream.
     */
    @SuppressWarnings("unchecked")
    public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path, WriteMode writeMode) {
        Preconditions.checkArgument(getType().isTupleType(),
                "The writeAsCsv() method can only be used on data streams of tuples.");
        CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
                CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
        if (writeMode != null) {
            of.setWriteMode(writeMode);
        }
        return write((OutputFormat<T>) of, 0L);
    }

    /**
     * Writes a DataStream to the file specified by path in CSV format. The writing is
     * performed periodically, every millis milliseconds.
     * <p>
     * For every field of an element of the DataStream the result of {@link Object#toString()}
     * is written. This method can only be used on data streams of tuples.
     *
     * @param path The path pointing to the location the CSV file is written to
     * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and
     *            OVERWRITE.
     * @param millis The file update frequency
     * @return The closed DataStream.
     */
    @SuppressWarnings("unchecked")
    public <X extends Tuple> DataStreamSink<T> writeAsCsv(String path, WriteMode writeMode, long millis) {
        Preconditions.checkArgument(getType().isTupleType(),
                "The writeAsCsv() method can only be used on data streams of tuples.");
        CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path),
                CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
        if (writeMode != null) {
            of.setWriteMode(writeMode);
        }
        return write((OutputFormat<T>) of, millis);
    }

    /**
     * Writes the DataStream to a socket as a byte array. The format of the output is specified
     * by a {@link SerializationSchema}.
     *
     * @param hostName The host of the socket
     * @param port The port of the socket
     * @param schema The schema used for serialization
     * @return The closed DataStream.
     */
    public DataStreamSink<T> writeToSocket(String hostName, int port, SerializationSchema<T, byte[]> schema) {
        DataStreamSink<T> returnStream = addSink(new SocketClientSink<>(hostName, port, schema, 0));
        returnStream.setParallelism(1); // It would not work if multiple instances would connect to the same port
        return returnStream;
    }

    /**
     * Writes the DataStream into an output, described by an OutputFormat.
     *
     * @param format The output format
     * @param millis The write frequency
     * @return The closed DataStream.
     */
    public DataStreamSink<T> write(OutputFormat<T> format, long millis) {
        return addSink(new FileSinkFunctionByMillis<>(format, millis));
    }

    /**
     * Method for passing user defined operators along with the type information that will
     * transform the DataStream.
     *
     * @param operatorName The name of the operator, for logging purposes
     * @param outTypeInfo The output type of the operator
     * @param operator The object containing the transformation logic
     * @param <R> The type of the return stream
     * @return The transformed data stream
     */
    public <R> SingleOutputStreamOperator<R, ?> transform(String operatorName,
            TypeInformation<R> outTypeInfo, OneInputStreamOperator<T, R> operator) {

        // read the output type of the input Transform to coax out errors about MissingTypeInfo
        transformation.getOutputType();

        OneInputTransformation<T, R> resultTransform = new OneInputTransformation<>(
                this.transformation,
                operatorName,
                operator,
                outTypeInfo,
                environment.getParallelism());

        @SuppressWarnings({ "unchecked", "rawtypes" })
        SingleOutputStreamOperator<R, ?> returnStream = new SingleOutputStreamOperator(environment, resultTransform);

        getExecutionEnvironment().addOperator(resultTransform);

        return returnStream;
    }

    /**
     * Internal function for setting the partitioner for the DataStream.
     *
     * @param partitioner The partitioner to set.
     * @return The modified DataStream.
     */
    protected DataStream<T> setConnectionType(StreamPartitioner<T> partitioner) {
        return new DataStream<>(this.getExecutionEnvironment(),
                new PartitionTransformation<>(this.getTransformation(), partitioner));
    }
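    // Illustrative usage sketch (not part of the original source): calling transform() directly
    // with one of the built-in operators, which is equivalent to calling map() with the same
    // function. The "input" stream is assumed for the example only, and BasicTypeInfo
    // (org.apache.flink.api.common.typeinfo.BasicTypeInfo) would need to be imported.
    //
    //   DataStream<String> input = ...;
    //   SingleOutputStreamOperator<String, ?> upper = input.transform(
    //       "UpperCase",
    //       BasicTypeInfo.STRING_TYPE_INFO,
    //       new StreamMap<>(new MapFunction<String, String>() {
    //           @Override
    //           public String map(String value) { return value.toUpperCase(); }
    //       }));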
    /**
     * Adds the given sink to this DataStream. Only streams with sinks added will be executed
     * once the {@link StreamExecutionEnvironment#execute()} method is called.
     *
     * @param sinkFunction The object containing the sink's invoke function.
     * @return The closed DataStream.
     */
    public DataStreamSink<T> addSink(SinkFunction<T> sinkFunction) {

        // read the output type of the input Transform to coax out errors about MissingTypeInfo
        transformation.getOutputType();

        // configure the type if needed
        if (sinkFunction instanceof InputTypeConfigurable) {
            ((InputTypeConfigurable) sinkFunction).setInputType(getType(), getExecutionConfig());
        }

        StreamSink<T> sinkOperator = new StreamSink<>(clean(sinkFunction));

        DataStreamSink<T> sink = new DataStreamSink<>(this, sinkOperator);

        getExecutionEnvironment().addOperator(sink.getTransformation());
        return sink;
    }

    /**
     * Returns the {@link StreamTransformation} that represents the operation that logically
     * creates this {@link DataStream}.
     *
     * @return The Transformation
     */
    public StreamTransformation<T> getTransformation() {
        return transformation;
    }
}