org.apache.flink.api.java.DataSet Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of flink-java Show documentation
There is a newer version: 1.20.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java;

import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.SerializedListAccumulator;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.GroupCombineFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.InvalidTypesException;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.io.FileOutputFormat;
import org.apache.flink.api.common.io.OutputFormat;
import org.apache.flink.api.common.operators.Keys;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.base.CrossOperatorBase.CrossHint;
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.common.operators.base.PartitionOperatorBase.PartitionMethod;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.aggregation.Aggregations;
import org.apache.flink.api.java.functions.FirstReducer;
import org.apache.flink.api.java.functions.FormattingMapper;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.functions.SelectByMaxFunction;
import org.apache.flink.api.java.functions.SelectByMinFunction;
import org.apache.flink.api.java.io.CsvOutputFormat;
import org.apache.flink.api.java.io.PrintingOutputFormat;
import org.apache.flink.api.java.io.TextOutputFormat;
import org.apache.flink.api.java.io.TextOutputFormat.TextFormatter;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.CoGroupOperator;
import org.apache.flink.api.java.operators.CoGroupOperator.CoGroupOperatorSets;
import org.apache.flink.api.java.operators.CrossOperator;
import org.apache.flink.api.java.operators.CustomUnaryOperation;
import org.apache.flink.api.java.operators.DataSink;
import org.apache.flink.api.java.operators.DeltaIteration;
import org.apache.flink.api.java.operators.DistinctOperator;
import org.apache.flink.api.java.operators.FilterOperator;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.GroupCombineOperator;
import org.apache.flink.api.java.operators.GroupReduceOperator;
import org.apache.flink.api.java.operators.IterativeDataSet;
import org.apache.flink.api.java.operators.JoinOperator.JoinOperatorSets;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.api.java.operators.MapPartitionOperator;
import org.apache.flink.api.java.operators.PartitionOperator;
import org.apache.flink.api.java.operators.ProjectOperator;
import org.apache.flink.api.java.operators.ProjectOperator.Projection;
import org.apache.flink.api.java.operators.ReduceOperator;
import org.apache.flink.api.java.operators.SortPartitionOperator;
import org.apache.flink.api.java.operators.SortedGrouping;
import org.apache.flink.api.java.operators.UnionOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.operators.join.JoinOperatorSetsBase;
import org.apache.flink.api.java.operators.join.JoinType;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.api.java.typeutils.MissingTypeInfo;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.core.fs.Path;
import org.apache.flink.util.AbstractID;
import org.apache.flink.util.Preconditions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * A DataSet represents a collection of elements of the same type.
 *
 * A DataSet can be transformed into another DataSet by applying a transformation as for example
 *
 * 

 *   {@link DataSet#map(org.apache.flink.api.common.functions.MapFunction)},
 *   
{@link DataSet#reduce(org.apache.flink.api.common.functions.ReduceFunction)},
 *   
{@link DataSet#join(DataSet)}, or
 *   
{@link DataSet#coGroup(DataSet)}.
 * 
 *
 * @param  The type of the DataSet, i.e., the type of the elements of the DataSet.
 */
@Public
public abstract class DataSet {

    protected final ExecutionEnvironment context;

    // NOTE: the type must not be accessed directly, but only via getType()
    private TypeInformation type;

    private boolean typeUsed = false;

    protected DataSet(ExecutionEnvironment context, TypeInformation typeInfo) {
        if (context == null) {
            throw new NullPointerException("context is null");
        }
        if (typeInfo == null) {
            throw new NullPointerException("typeInfo is null");
        }

        this.context = context;
        this.type = typeInfo;
    }

    /**
     * Returns the {@link ExecutionEnvironment} in which this DataSet is registered.
     *
     * @return The ExecutionEnvironment in which this DataSet is registered.
     * @see ExecutionEnvironment
     */
    public ExecutionEnvironment getExecutionEnvironment() {
        return this.context;
    }

    // --------------------------------------------------------------------------------------------
    //  Type Information handling
    // --------------------------------------------------------------------------------------------

    /**
     * Tries to fill in the type information. Type information can be filled in later when the
     * program uses a type hint. This method checks whether the type information has ever been
     * accessed before and does not allow modifications if the type was accessed already. This
     * ensures consistency by making sure different parts of the operation do not assume different
     * type information.
     *
     * @param typeInfo The type information to fill in.
     * @throws IllegalStateException Thrown, if the type information has been accessed before.
     */
    protected void fillInType(TypeInformation typeInfo) {
        if (typeUsed) {
            throw new IllegalStateException(
                    "TypeInformation cannot be filled in for the type after it has been used. "
                            + "Please make sure that the type info hints are the first call after the transformation function, "
                            + "before any access to types or semantic properties, etc.");
        }
        this.type = typeInfo;
    }

    /**
     * Returns the {@link TypeInformation} for the type of this DataSet.
     *
     * @return The TypeInformation for the type of this DataSet.
     * @see TypeInformation
     */
    public TypeInformation getType() {
        if (type instanceof MissingTypeInfo) {
            MissingTypeInfo typeInfo = (MissingTypeInfo) type;
            throw new InvalidTypesException(
                    "The return type of function '"
                            + typeInfo.getFunctionName()
                            + "' could not be determined automatically, due to type erasure. "
                            + "You can give type information hints by using the returns(...) method on the result of "
                            + "the transformation call, or by letting your function implement the 'ResultTypeQueryable' "
                            + "interface.",
                    typeInfo.getTypeException());
        }
        typeUsed = true;
        return this.type;
    }

    public  F clean(F f) {
        if (getExecutionEnvironment().getConfig().isClosureCleanerEnabled()) {
            ClosureCleaner.clean(
                    f, getExecutionEnvironment().getConfig().getClosureCleanerLevel(), true);
        } else {
            ClosureCleaner.ensureSerializable(f);
        }
        return f;
    }

    // --------------------------------------------------------------------------------------------
    //  Filter & Transformations
    // --------------------------------------------------------------------------------------------

    /**
     * Applies a Map transformation on this DataSet.
     *
     * The transformation calls a {@link org.apache.flink.api.common.functions.MapFunction} for
     * each element of the DataSet. Each MapFunction call returns exactly one element.
     *
     * @param mapper The MapFunction that is called for each element of the DataSet.
     * @return A MapOperator that represents the transformed DataSet.
     * @see org.apache.flink.api.common.functions.MapFunction
     * @see org.apache.flink.api.common.functions.RichMapFunction
     * @see MapOperator
     */
    public  MapOperator map(MapFunction mapper) {
        if (mapper == null) {
            throw new NullPointerException("Map function must not be null.");
        }

        String callLocation = Utils.getCallLocationName();
        TypeInformation resultType =
                TypeExtractor.getMapReturnTypes(mapper, getType(), callLocation, true);
        return new MapOperator<>(this, resultType, clean(mapper), callLocation);
    }

    /**
     * Applies a Map-style operation to the entire partition of the data. The function is called
     * once per parallel partition of the data, and the entire partition is available through the
     * given Iterator. The number of elements that each instance of the MapPartition function sees
     * is non deterministic and depends on the parallelism of the operation.
     *
     * 
This function is intended for operations that cannot transform individual elements,
     * requires no grouping of elements. To transform individual elements, the use of {@code map()}
     * and {@code flatMap()} is preferable.
     *
     * @param mapPartition The MapPartitionFunction that is called for the full DataSet.
     * @return A MapPartitionOperator that represents the transformed DataSet.
     * @see MapPartitionFunction
     * @see MapPartitionOperator
     */
    public  MapPartitionOperator mapPartition(MapPartitionFunction mapPartition) {
        if (mapPartition == null) {
            throw new NullPointerException("MapPartition function must not be null.");
        }

        String callLocation = Utils.getCallLocationName();
        TypeInformation resultType =
                TypeExtractor.getMapPartitionReturnTypes(
                        mapPartition, getType(), callLocation, true);
        return new MapPartitionOperator<>(this, resultType, clean(mapPartition), callLocation);
    }

    /**
     * Applies a FlatMap transformation on a {@link DataSet}.
     *
     * 
The transformation calls a {@link
     * org.apache.flink.api.common.functions.RichFlatMapFunction} for each element of the DataSet.
     * Each FlatMapFunction call can return any number of elements including none.
     *
     * @param flatMapper The FlatMapFunction that is called for each element of the DataSet.
     * @return A FlatMapOperator that represents the transformed DataSet.
     * @see org.apache.flink.api.common.functions.RichFlatMapFunction
     * @see FlatMapOperator
     * @see DataSet
     */
    public  FlatMapOperator flatMap(FlatMapFunction flatMapper) {
        if (flatMapper == null) {
            throw new NullPointerException("FlatMap function must not be null.");
        }

        String callLocation = Utils.getCallLocationName();
        TypeInformation resultType =
                TypeExtractor.getFlatMapReturnTypes(flatMapper, getType(), callLocation, true);
        return new FlatMapOperator<>(this, resultType, clean(flatMapper), callLocation);
    }

    /**
     * Applies a Filter transformation on a {@link DataSet}.
     *
     * 
The transformation calls a {@link
     * org.apache.flink.api.common.functions.RichFilterFunction} for each element of the DataSet and
     * retains only those element for which the function returns true. Elements for which the
     * function returns false are filtered.
     *
     * @param filter The FilterFunction that is called for each element of the DataSet.
     * @return A FilterOperator that represents the filtered DataSet.
     * @see org.apache.flink.api.common.functions.RichFilterFunction
     * @see FilterOperator
     * @see DataSet
     */
    public FilterOperator filter(FilterFunction filter) {
        if (filter == null) {
            throw new NullPointerException("Filter function must not be null.");
        }
        return new FilterOperator<>(this, clean(filter), Utils.getCallLocationName());
    }

    // --------------------------------------------------------------------------------------------
    //  Projections
    // --------------------------------------------------------------------------------------------

    /**
     * Applies a Project transformation on a {@link Tuple} {@link DataSet}.
     *
     * 
Note: Only Tuple DataSets can be projected using field indexes.
     *
     * 
The transformation projects each Tuple of the DataSet onto a (sub)set of fields.
     *
     * 
Additional fields can be added to the projection by calling {@link
     * ProjectOperator#project(int[])}.
     *
     * 
Note: With the current implementation, the Project transformation looses type
     * information.
     *
     * @param fieldIndexes The field indexes of the input tuple that are retained. The order of
     *     fields in the output tuple corresponds to the order of field indexes.
     * @return A ProjectOperator that represents the projected DataSet.
     * @see Tuple
     * @see DataSet
     * @see ProjectOperator
     */
    public  ProjectOperator project(int... fieldIndexes) {
        return new Projection<>(this, fieldIndexes).projectTupleX();
    }

    // --------------------------------------------------------------------------------------------
    //  Non-grouped aggregations
    // --------------------------------------------------------------------------------------------

    /**
     * Applies an Aggregate transformation on a non-grouped {@link Tuple} {@link DataSet}.
     *
     * 
Note: Only Tuple DataSets can be aggregated. The transformation applies a built-in
     * {@link Aggregations Aggregation} on a specified field of a Tuple DataSet. Additional
     * aggregation functions can be added to the resulting {@link AggregateOperator} by calling
     * {@link AggregateOperator#and(Aggregations, int)}.
     *
     * @param agg The built-in aggregation function that is computed.
     * @param field The index of the Tuple field on which the aggregation function is applied.
     * @return An AggregateOperator that represents the aggregated DataSet.
     * @see Tuple
     * @see Aggregations
     * @see AggregateOperator
     * @see DataSet
     */
    public AggregateOperator aggregate(Aggregations agg, int field) {
        return new AggregateOperator<>(this, agg, field, Utils.getCallLocationName());
    }

    /**
     * Syntactic sugar for aggregate (SUM, field).
     *
     * @param field The index of the Tuple field on which the aggregation function is applied.
     * @return An AggregateOperator that represents the summed DataSet.
     * @see org.apache.flink.api.java.operators.AggregateOperator
     */
    public AggregateOperator sum(int field) {
        return aggregate(Aggregations.SUM, field);
    }

    /**
     * Syntactic sugar for {@link #aggregate(Aggregations, int)} using {@link Aggregations#MAX} as
     * the aggregation function.
     *
     * 
Note: This operation is not to be confused with {@link #maxBy(int...)},
     * which selects one element with maximum value at the specified field positions.
     *
     * @param field The index of the Tuple field on which the aggregation function is applied.
     * @return An AggregateOperator that represents the max'ed DataSet.
     * @see #aggregate(Aggregations, int)
     * @see #maxBy(int...)
     */
    public AggregateOperator max(int field) {
        return aggregate(Aggregations.MAX, field);
    }

    /**
     * Syntactic sugar for {@link #aggregate(Aggregations, int)} using {@link Aggregations#MIN} as
     * the aggregation function.
     *
     * 
Note: This operation is not to be confused with {@link #minBy(int...)},
     * which selects one element with the minimum value at the specified field positions.
     *
     * @param field The index of the Tuple field on which the aggregation function is applied.
     * @return An AggregateOperator that represents the min'ed DataSet.
     * @see #aggregate(Aggregations, int)
     * @see #minBy(int...)
     */
    public AggregateOperator min(int field) {
        return aggregate(Aggregations.MIN, field);
    }

    /**
     * Convenience method to get the count (number of elements) of a DataSet.
     *
     * @return A long integer that represents the number of elements in the data set.
     */
    public long count() throws Exception {
        final String id = new AbstractID().toString();

        output(new Utils.CountHelper(id)).name("count()");

        JobExecutionResult res = getExecutionEnvironment().execute();
        return res.getAccumulatorResult(id);
    }

    /**
     * Convenience method to get the elements of a DataSet as a List. As DataSet can contain a lot
     * of data, this method should be used with caution.
     *
     * @return A List containing the elements of the DataSet
     */
    public List collect() throws Exception {
        final String id = new AbstractID().toString();
        final TypeSerializer serializer =
                getType().createSerializer(getExecutionEnvironment().getConfig());

        this.output(new Utils.CollectHelper<>(id, serializer)).name("collect()");
        JobExecutionResult res = getExecutionEnvironment().execute();

        ArrayList accResult = res.getAccumulatorResult(id);
        if (accResult != null) {
            try {
                return SerializedListAccumulator.deserializeList(accResult, serializer);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Cannot find type class of collected data type.", e);
            } catch (IOException e) {
                throw new RuntimeException(
                        "Serialization error while deserializing collected data", e);
            }
        } else {
            throw new RuntimeException("The call to collect() could not retrieve the DataSet.");
        }
    }

    /**
     * Applies a Reduce transformation on a non-grouped {@link DataSet}.
     *
     * 
The transformation consecutively calls a {@link
     * org.apache.flink.api.common.functions.RichReduceFunction} until only a single element remains
     * which is the result of the transformation. A ReduceFunction combines two elements into one
     * new element of the same type.
     *
     * @param reducer The ReduceFunction that is applied on the DataSet.
     * @return A ReduceOperator that represents the reduced DataSet.
     * @see org.apache.flink.api.common.functions.RichReduceFunction
     * @see ReduceOperator
     * @see DataSet
     */
    public ReduceOperator reduce(ReduceFunction reducer) {
        if (reducer == null) {
            throw new NullPointerException("Reduce function must not be null.");
        }
        return new ReduceOperator<>(this, clean(reducer), Utils.getCallLocationName());
    }

    /**
     * Applies a GroupReduce transformation on a non-grouped {@link DataSet}.
     *
     * 
The transformation calls a {@link
     * org.apache.flink.api.common.functions.RichGroupReduceFunction} once with the full DataSet.
     * The GroupReduceFunction can iterate over all elements of the DataSet and emit any number of
     * output elements including none.
     *
     * @param reducer The GroupReduceFunction that is applied on the DataSet.
     * @return A GroupReduceOperator that represents the reduced DataSet.
     * @see org.apache.flink.api.common.functions.RichGroupReduceFunction
     * @see org.apache.flink.api.java.operators.GroupReduceOperator
     * @see DataSet
     */
    public  GroupReduceOperator reduceGroup(GroupReduceFunction reducer) {
        if (reducer == null) {
            throw new NullPointerException("GroupReduce function must not be null.");
        }

        String callLocation = Utils.getCallLocationName();
        TypeInformation resultType =
                TypeExtractor.getGroupReduceReturnTypes(reducer, getType(), callLocation, true);
        return new GroupReduceOperator<>(this, resultType, clean(reducer), callLocation);
    }

    /**
     * Applies a GroupCombineFunction on a non-grouped {@link DataSet}. A CombineFunction is similar
     * to a GroupReduceFunction but does not perform a full data exchange. Instead, the
     * CombineFunction calls the combine method once per partition for combining a group of results.
     * This operator is suitable for combining values into an intermediate format before doing a
     * proper groupReduce where the data is shuffled across the node for further reduction. The
     * GroupReduce operator can also be supplied with a combiner by implementing the RichGroupReduce
     * function. The combine method of the RichGroupReduce function demands input and output type to
     * be the same. The CombineFunction, on the other side, can have an arbitrary output type.
     *
     * @param combiner The GroupCombineFunction that is applied on the DataSet.
     * @return A GroupCombineOperator which represents the combined DataSet.
     */
    public  GroupCombineOperator combineGroup(GroupCombineFunction combiner) {
        if (combiner == null) {
            throw new NullPointerException("GroupCombine function must not be null.");
        }

        String callLocation = Utils.getCallLocationName();
        TypeInformation resultType =
                TypeExtractor.getGroupCombineReturnTypes(combiner, getType(), callLocation, true);
        return new GroupCombineOperator<>(this, resultType, clean(combiner), callLocation);
    }

    /**
     * Selects an element with minimum value.
     *
     * 
The minimum is computed over the specified fields in lexicographical order.
     *
     * 
Example 1: Given a data set with elements [0, 1], [1, 0],
     * the results will be:
     *
     * 

     *   minBy(0): [0, 1]
     *   
minBy(1): [1, 0]
     * 
     *
     * Example 2: Given a data set with elements [0, 0], [0, 1],
     * the results will be:
     *
     * 

     *   minBy(0, 1): [0, 0]
     * 
     *
     * If multiple values with minimum value at the specified fields exist, a random one will be
     * picked.
     *
     * 
Internally, this operation is implemented as a {@link ReduceFunction}.
     *
     * @param fields Field positions to compute the minimum over
     * @return A {@link ReduceOperator} representing the minimum
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public ReduceOperator minBy(int... fields) {
        if (!getType().isTupleType() || !(getType() instanceof TupleTypeInfo)) {
            throw new InvalidProgramException("DataSet#minBy(int...) only works on Tuple types.");
        }

        return new ReduceOperator<>(
                this,
                new SelectByMinFunction((TupleTypeInfo) getType(), fields),
                Utils.getCallLocationName());
    }

    /**
     * Selects an element with maximum value.
     *
     * 
The maximum is computed over the specified fields in lexicographical order.
     *
     * 
Example 1: Given a data set with elements [0, 1], [1, 0],
     * the results will be:
     *
     * 

     *   maxBy(0): [1, 0]
     *   
maxBy(1): [0, 1]
     * 
     *
     * Example 2: Given a data set with elements [0, 0], [0, 1],
     * the results will be:
     *
     * 

     *   maxBy(0, 1): [0, 1]
     * 
     *
     * If multiple values with maximum value at the specified fields exist, a random one will be
     * picked.
     *
     * 
Internally, this operation is implemented as a {@link ReduceFunction}.
     *
     * @param fields Field positions to compute the maximum over
     * @return A {@link ReduceOperator} representing the maximum
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public ReduceOperator maxBy(int... fields) {
        if (!getType().isTupleType() || !(getType() instanceof TupleTypeInfo)) {
            throw new InvalidProgramException("DataSet#maxBy(int...) only works on Tuple types.");
        }

        return new ReduceOperator<>(
                this,
                new SelectByMaxFunction((TupleTypeInfo) getType(), fields),
                Utils.getCallLocationName());
    }

    /**
     * Returns a new set containing the first n elements in this {@link DataSet}.
     *
     * @param n The desired number of elements.
     * @return A ReduceGroupOperator that represents the DataSet containing the elements.
     */
    public GroupReduceOperator first(int n) {
        if (n < 1) {
            throw new InvalidProgramException("Parameter n of first(n) must be at least 1.");
        }

        return reduceGroup(new FirstReducer(n));
    }

    // --------------------------------------------------------------------------------------------
    //  distinct
    // --------------------------------------------------------------------------------------------

    /**
     * Returns a distinct set of a {@link DataSet} using a {@link KeySelector} function.
     *
     * 
The KeySelector function is called for each element of the DataSet and extracts a single
     * key value on which the decision is made if two items are distinct or not.
     *
     * @param keyExtractor The KeySelector function which extracts the key values from the DataSet
     *     on which the distinction of the DataSet is decided.
     * @return A DistinctOperator that represents the distinct DataSet.
     */
    public  DistinctOperator distinct(KeySelector keyExtractor) {
        TypeInformation keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, getType());
        return new DistinctOperator<>(
                this,
                new Keys.SelectorFunctionKeys<>(keyExtractor, getType(), keyType),
                Utils.getCallLocationName());
    }

    /**
     * Returns a distinct set of a {@link Tuple} {@link DataSet} using field position keys.
     *
     * 
The field position keys specify the fields of Tuples on which the decision is made if two
     * Tuples are distinct or not.
     *
     * 
Note: Field position keys can only be specified for Tuple DataSets.
     *
     * @param fields One or more field positions on which the distinction of the DataSet is decided.
     * @return A DistinctOperator that represents the distinct DataSet.
     */
    public DistinctOperator distinct(int... fields) {
        return new DistinctOperator<>(
                this, new Keys.ExpressionKeys<>(fields, getType()), Utils.getCallLocationName());
    }

    /**
     * Returns a distinct set of a {@link DataSet} using expression keys.
     *
     * 
The field expression keys specify the fields of a {@link
     * org.apache.flink.api.common.typeutils.CompositeType} (e.g., Tuple or Pojo type) on which the
     * decision is made if two elements are distinct or not. In case of a {@link
     * org.apache.flink.api.common.typeinfo.AtomicType}, only the wildcard expression ("*") is
     * valid.
     *
     * @param fields One or more field expressions on which the distinction of the DataSet is
     *     decided.
     * @return A DistinctOperator that represents the distinct DataSet.
     */
    public DistinctOperator distinct(String... fields) {
        return new DistinctOperator<>(
                this, new Keys.ExpressionKeys<>(fields, getType()), Utils.getCallLocationName());
    }

    /**
     * Returns a distinct set of a {@link DataSet}.
     *
     * 
If the input is a {@link org.apache.flink.api.common.typeutils.CompositeType} (Tuple or
     * Pojo type), distinct is performed on all fields and each field must be a key type
     *
     * @return A DistinctOperator that represents the distinct DataSet.
     */
    public DistinctOperator distinct() {
        return new DistinctOperator<>(this, null, Utils.getCallLocationName());
    }

    // --------------------------------------------------------------------------------------------
    //  Grouping
    // --------------------------------------------------------------------------------------------

    /**
     * Groups a {@link DataSet} using a {@link KeySelector} function. The KeySelector function is
     * called for each element of the DataSet and extracts a single key value on which the DataSet
     * is grouped.
     *
     * 
This method returns an {@link UnsortedGrouping} on which one of the following grouping
     * transformation can be applied.
     *
     * 

     *   {@link UnsortedGrouping#sortGroup(int, org.apache.flink.api.common.operators.Order)} to
     *       get a {@link SortedGrouping}.
     *   
{@link UnsortedGrouping#aggregate(Aggregations, int)} to apply an Aggregate
     *       transformation.
     *   
{@link UnsortedGrouping#reduce(org.apache.flink.api.common.functions.ReduceFunction)}
     *       to apply a Reduce transformation.
     *   
{@link
     *       UnsortedGrouping#reduceGroup(org.apache.flink.api.common.functions.GroupReduceFunction)}
     *       to apply a GroupReduce transformation.
     * 
     *
     * @param keyExtractor The KeySelector function which extracts the key values from the DataSet
     *     on which it is grouped.
     * @return An UnsortedGrouping on which a transformation needs to be applied to obtain a
     *     transformed DataSet.
     * @see KeySelector
     * @see UnsortedGrouping
     * @see AggregateOperator
     * @see ReduceOperator
     * @see org.apache.flink.api.java.operators.GroupReduceOperator
     * @see DataSet
     */
    public  UnsortedGrouping groupBy(KeySelector keyExtractor) {
        TypeInformation keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, getType());
        return new UnsortedGrouping<>(
                this, new Keys.SelectorFunctionKeys<>(clean(keyExtractor), getType(), keyType));
    }

    /**
     * Groups a {@link Tuple} {@link DataSet} using field position keys.
     *
     * Note: Field position keys only be specified for Tuple DataSets.
     *
     * 
The field position keys specify the fields of Tuples on which the DataSet is grouped. This
     * method returns an {@link UnsortedGrouping} on which one of the following grouping
     * transformation can be applied.
     *
     * 

     *   {@link UnsortedGrouping#sortGroup(int, org.apache.flink.api.common.operators.Order)} to
     *       get a {@link SortedGrouping}.
     *   
{@link UnsortedGrouping#aggregate(Aggregations, int)} to apply an Aggregate
     *       transformation.
     *   
{@link UnsortedGrouping#reduce(org.apache.flink.api.common.functions.ReduceFunction)}
     *       to apply a Reduce transformation.
     *   
{@link
     *       UnsortedGrouping#reduceGroup(org.apache.flink.api.common.functions.GroupReduceFunction)}
     *       to apply a GroupReduce transformation.
     * 
     *
     * @param fields One or more field positions on which the DataSet will be grouped.
     * @return A Grouping on which a transformation needs to be applied to obtain a transformed
     *     DataSet.
     * @see Tuple
     * @see UnsortedGrouping
     * @see AggregateOperator
     * @see ReduceOperator
     * @see org.apache.flink.api.java.operators.GroupReduceOperator
     * @see DataSet
     */
    public UnsortedGrouping groupBy(int... fields) {
        return new UnsortedGrouping<>(this, new Keys.ExpressionKeys<>(fields, getType()));
    }

    /**
     * Groups a {@link DataSet} using field expressions. A field expression is either the name of a
     * public field or a getter method with parentheses of the {@link DataSet}S underlying type. A
     * dot can be used to drill down into objects, as in {@code "field1.getInnerField2()" }. This
     * method returns an {@link UnsortedGrouping} on which one of the following grouping
     * transformation can be applied.
     *
     * 
     *   {@link UnsortedGrouping#sortGroup(int, org.apache.flink.api.common.operators.Order)} to
     *       get a {@link SortedGrouping}.
     *   
{@link UnsortedGrouping#aggregate(Aggregations, int)} to apply an Aggregate
     *       transformation.
     *   
{@link UnsortedGrouping#reduce(org.apache.flink.api.common.functions.ReduceFunction)}
     *       to apply a Reduce transformation.
     *   
{@link
     *       UnsortedGrouping#reduceGroup(org.apache.flink.api.common.functions.GroupReduceFunction)}
     *       to apply a GroupReduce transformation.
     * 
     *
     * @param fields One or more field expressions on which the DataSet will be grouped.
     * @return A Grouping on which a transformation needs to be applied to obtain a transformed
     *     DataSet.
     * @see Tuple
     * @see UnsortedGrouping
     * @see AggregateOperator
     * @see ReduceOperator
     * @see org.apache.flink.api.java.operators.GroupReduceOperator
     * @see DataSet
     */
    public UnsortedGrouping groupBy(String... fields) {
        return new UnsortedGrouping<>(this, new Keys.ExpressionKeys<>(fields, getType()));
    }

    // --------------------------------------------------------------------------------------------
    //  Joining
    // --------------------------------------------------------------------------------------------

    /**
     * Initiates a Join transformation.
     *
     * A Join transformation joins the elements of two {@link DataSet DataSets} on key equality
     * and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
This method returns a {@link JoinOperatorSets} on which one of the {@code where} methods
     * can be called to define the join key of the first joining (i.e., this) DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @return A JoinOperatorSets to continue the definition of the Join transformation.
     * @see JoinOperatorSets
     * @see DataSet
     */
    public  JoinOperatorSets join(DataSet other) {
        return new JoinOperatorSets<>(this, other);
    }

    /**
     * Initiates a Join transformation.
     *
     * 
A Join transformation joins the elements of two {@link DataSet DataSets} on key equality
     * and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
This method returns a {@link JoinOperatorSets} on which one of the {@code where} methods
     * can be called to define the join key of the first joining (i.e., this) DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @param strategy The strategy that should be used execute the join. If {@code null} is given,
     *     then the optimizer will pick the join strategy.
     * @return A JoinOperatorSets to continue the definition of the Join transformation.
     * @see JoinOperatorSets
     * @see DataSet
     */
    public  JoinOperatorSets join(DataSet other, JoinHint strategy) {
        return new JoinOperatorSets<>(this, other, strategy);
    }

    /**
     * Initiates a Join transformation.
     *
     * 
A Join transformation joins the elements of two {@link DataSet DataSets} on key equality
     * and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
This method also gives the hint to the optimizer that the second DataSet to join is much
     * smaller than the first one.
     *
     * 
This method returns a {@link JoinOperatorSets} on which {@link
     * JoinOperatorSets#where(String...)} needs to be called to define the join key of the first
     * joining (i.e., this) DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @return A JoinOperatorSets to continue the definition of the Join transformation.
     * @see JoinOperatorSets
     * @see DataSet
     */
    public  JoinOperatorSets joinWithTiny(DataSet other) {
        return new JoinOperatorSets<>(this, other, JoinHint.BROADCAST_HASH_SECOND);
    }

    /**
     * Initiates a Join transformation.
     *
     * 
A Join transformation joins the elements of two {@link DataSet DataSets} on key equality
     * and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
This method also gives the hint to the optimizer that the second DataSet to join is much
     * larger than the first one.
     *
     * 
This method returns a {@link JoinOperatorSets} on which one of the {@code where} methods
     * can be called to define the join key of the first joining (i.e., this) DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see JoinOperatorSets
     * @see DataSet
     */
    public  JoinOperatorSets joinWithHuge(DataSet other) {
        return new JoinOperatorSets<>(this, other, JoinHint.BROADCAST_HASH_FIRST);
    }

    /**
     * Initiates a Left Outer Join transformation.
     *
     * 
An Outer Join transformation joins two elements of two {@link DataSet DataSets} on key
     * equality and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
Elements of the left DataSet (i.e. {@code this}) that do not have a matching
     * element on the other side are joined with {@code null} and emitted to the resulting DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see org.apache.flink.api.java.operators.join.JoinOperatorSetsBase
     * @see DataSet
     */
    public  JoinOperatorSetsBase leftOuterJoin(DataSet other) {
        return new JoinOperatorSetsBase<>(
                this, other, JoinHint.OPTIMIZER_CHOOSES, JoinType.LEFT_OUTER);
    }

    /**
     * Initiates a Left Outer Join transformation.
     *
     * 
An Outer Join transformation joins two elements of two {@link DataSet DataSets} on key
     * equality and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
Elements of the left DataSet (i.e. {@code this}) that do not have a matching
     * element on the other side are joined with {@code null} and emitted to the resulting DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @param strategy The strategy that should be used execute the join. If {@code null} is given,
     *     then the optimizer will pick the join strategy.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see org.apache.flink.api.java.operators.join.JoinOperatorSetsBase
     * @see DataSet
     */
    public  JoinOperatorSetsBase leftOuterJoin(DataSet other, JoinHint strategy) {
        switch (strategy) {
            case OPTIMIZER_CHOOSES:
            case REPARTITION_SORT_MERGE:
            case REPARTITION_HASH_FIRST:
            case REPARTITION_HASH_SECOND:
            case BROADCAST_HASH_SECOND:
                return new JoinOperatorSetsBase<>(this, other, strategy, JoinType.LEFT_OUTER);
            default:
                throw new InvalidProgramException(
                        "Invalid JoinHint for LeftOuterJoin: " + strategy);
        }
    }

    /**
     * Initiates a Right Outer Join transformation.
     *
     * 
An Outer Join transformation joins two elements of two {@link DataSet DataSets} on key
     * equality and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
Elements of the right DataSet (i.e. {@code other}) that do not have a matching
     * element on {@code this} side are joined with {@code null} and emitted to the resulting
     * DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see org.apache.flink.api.java.operators.join.JoinOperatorSetsBase
     * @see DataSet
     */
    public  JoinOperatorSetsBase rightOuterJoin(DataSet other) {
        return new JoinOperatorSetsBase<>(
                this, other, JoinHint.OPTIMIZER_CHOOSES, JoinType.RIGHT_OUTER);
    }

    /**
     * Initiates a Right Outer Join transformation.
     *
     * 
An Outer Join transformation joins two elements of two {@link DataSet DataSets} on key
     * equality and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
Elements of the right DataSet (i.e. {@code other}) that do not have a matching
     * element on {@code this} side are joined with {@code null} and emitted to the resulting
     * DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @param strategy The strategy that should be used execute the join. If {@code null} is given,
     *     then the optimizer will pick the join strategy.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see org.apache.flink.api.java.operators.join.JoinOperatorSetsBase
     * @see DataSet
     */
    public  JoinOperatorSetsBase rightOuterJoin(DataSet other, JoinHint strategy) {
        switch (strategy) {
            case OPTIMIZER_CHOOSES:
            case REPARTITION_SORT_MERGE:
            case REPARTITION_HASH_FIRST:
            case REPARTITION_HASH_SECOND:
            case BROADCAST_HASH_FIRST:
                return new JoinOperatorSetsBase<>(this, other, strategy, JoinType.RIGHT_OUTER);
            default:
                throw new InvalidProgramException(
                        "Invalid JoinHint for RightOuterJoin: " + strategy);
        }
    }

    /**
     * Initiates a Full Outer Join transformation.
     *
     * 
An Outer Join transformation joins two elements of two {@link DataSet DataSets} on key
     * equality and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
Elements of both DataSets that do not have a matching element on the opposing side
     * are joined with {@code null} and emitted to the resulting DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see org.apache.flink.api.java.operators.join.JoinOperatorSetsBase
     * @see DataSet
     */
    public  JoinOperatorSetsBase fullOuterJoin(DataSet other) {
        return new JoinOperatorSetsBase<>(
                this, other, JoinHint.OPTIMIZER_CHOOSES, JoinType.FULL_OUTER);
    }

    /**
     * Initiates a Full Outer Join transformation.
     *
     * 
An Outer Join transformation joins two elements of two {@link DataSet DataSets} on key
     * equality and provides multiple ways to combine joining elements into one DataSet.
     *
     * 
Elements of both DataSets that do not have a matching element on the opposing side
     * are joined with {@code null} and emitted to the resulting DataSet.
     *
     * @param other The other DataSet with which this DataSet is joined.
     * @param strategy The strategy that should be used execute the join. If {@code null} is given,
     *     then the optimizer will pick the join strategy.
     * @return A JoinOperatorSet to continue the definition of the Join transformation.
     * @see org.apache.flink.api.java.operators.join.JoinOperatorSetsBase
     * @see DataSet
     */
    public  JoinOperatorSetsBase fullOuterJoin(DataSet other, JoinHint strategy) {
        switch (strategy) {
            case OPTIMIZER_CHOOSES:
            case REPARTITION_SORT_MERGE:
            case REPARTITION_HASH_FIRST:
            case REPARTITION_HASH_SECOND:
                return new JoinOperatorSetsBase<>(this, other, strategy, JoinType.FULL_OUTER);
            default:
                throw new InvalidProgramException(
                        "Invalid JoinHint for FullOuterJoin: " + strategy);
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Co-Grouping
    // --------------------------------------------------------------------------------------------

    /**
     * Initiates a CoGroup transformation.
     *
     * 
A CoGroup transformation combines the elements of two {@link DataSet DataSets} into one
     * DataSet. It groups each DataSet individually on a key and gives groups of both DataSets with
     * equal keys together into a {@link org.apache.flink.api.common.functions.RichCoGroupFunction}.
     * If a DataSet has a group with no matching key in the other DataSet, the CoGroupFunction is
     * called with an empty group for the non-existing group.
     *
     * 
The CoGroupFunction can iterate over the elements of both groups and return any number of
     * elements including none.
     *
     * 
This method returns a {@link CoGroupOperatorSets} on which one of the {@code where}
     * methods can be called to define the join key of the first joining (i.e., this) DataSet.
     *
     * @param other The other DataSet of the CoGroup transformation.
     * @return A CoGroupOperatorSets to continue the definition of the CoGroup transformation.
     * @see CoGroupOperatorSets
     * @see CoGroupOperator
     * @see DataSet
     */
    public  CoGroupOperator.CoGroupOperatorSets coGroup(DataSet other) {
        return new CoGroupOperator.CoGroupOperatorSets<>(this, other);
    }

    // --------------------------------------------------------------------------------------------
    //  Cross
    // --------------------------------------------------------------------------------------------

    /**
     * Continues a Join transformation and defines the {@link Tuple} fields of the second join
     * {@link DataSet} that should be used as join keys.
     *
     * 
Note: Fields can only be selected as join keys on Tuple DataSets.
     *
     * 
The resulting {@link DefaultJoin} wraps each pair of joining elements into a {@link
     * Tuple2}, with the element of the first input being the first field of the tuple and the
     * element of the second input being the second field of the tuple.
     *
     * @param fields The indexes of the Tuple fields of the second join DataSet that should be used
     *     as keys.
     * @return A DefaultJoin that represents the joined DataSet.
     */

    /**
     * Initiates a Cross transformation.
     *
     * 
A Cross transformation combines the elements of two {@link DataSet DataSets} into one
     * DataSet. It builds all pair combinations of elements of both DataSets, i.e., it builds a
     * Cartesian product.
     *
     * 
The resulting {@link org.apache.flink.api.java.operators.CrossOperator.DefaultCross} wraps
     * each pair of crossed elements into a {@link Tuple2}, with the element of the first input
     * being the first field of the tuple and the element of the second input being the second field
     * of the tuple.
     *
     * 
Call {@link
     * org.apache.flink.api.java.operators.CrossOperator.DefaultCross#with(org.apache.flink.api.common.functions.CrossFunction)}
     * to define a {@link org.apache.flink.api.common.functions.CrossFunction} which is called for
     * each pair of crossed elements. The CrossFunction returns a exactly one element for each pair
     * of input elements.
     *
     * @param other The other DataSet with which this DataSet is crossed.
     * @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
     * @see org.apache.flink.api.java.operators.CrossOperator.DefaultCross
     * @see org.apache.flink.api.common.functions.CrossFunction
     * @see DataSet
     * @see Tuple2
     */
    public  CrossOperator.DefaultCross cross(DataSet other) {
        return new CrossOperator.DefaultCross<>(
                this, other, CrossHint.OPTIMIZER_CHOOSES, Utils.getCallLocationName());
    }

    /**
     * Initiates a Cross transformation.
     *
     * 
A Cross transformation combines the elements of two {@link DataSet DataSets} into one
     * DataSet. It builds all pair combinations of elements of both DataSets, i.e., it builds a
     * Cartesian product. This method also gives the hint to the optimizer that the second DataSet
     * to cross is much smaller than the first one.
     *
     * 
The resulting {@link org.apache.flink.api.java.operators.CrossOperator.DefaultCross} wraps
     * each pair of crossed elements into a {@link Tuple2}, with the element of the first input
     * being the first field of the tuple and the element of the second input being the second field
     * of the tuple.
     *
     * 
Call {@link
     * org.apache.flink.api.java.operators.CrossOperator.DefaultCross#with(org.apache.flink.api.common.functions.CrossFunction)}
     * to define a {@link org.apache.flink.api.common.functions.CrossFunction} which is called for
     * each pair of crossed elements. The CrossFunction returns a exactly one element for each pair
     * of input elements.
     *
     * @param other The other DataSet with which this DataSet is crossed.
     * @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
     * @see org.apache.flink.api.java.operators.CrossOperator.DefaultCross
     * @see org.apache.flink.api.common.functions.CrossFunction
     * @see DataSet
     * @see Tuple2
     */
    public  CrossOperator.DefaultCross crossWithTiny(DataSet other) {
        return new CrossOperator.DefaultCross<>(
                this, other, CrossHint.SECOND_IS_SMALL, Utils.getCallLocationName());
    }

    /**
     * Initiates a Cross transformation.
     *
     * 
A Cross transformation combines the elements of two {@link DataSet DataSets} into one
     * DataSet. It builds all pair combinations of elements of both DataSets, i.e., it builds a
     * Cartesian product. This method also gives the hint to the optimizer that the second DataSet
     * to cross is much larger than the first one.
     *
     * 
The resulting {@link org.apache.flink.api.java.operators.CrossOperator.DefaultCross} wraps
     * each pair of crossed elements into a {@link Tuple2}, with the element of the first input
     * being the first field of the tuple and the element of the second input being the second field
     * of the tuple.
     *
     * 
Call {@link
     * org.apache.flink.api.java.operators.CrossOperator.DefaultCross#with(org.apache.flink.api.common.functions.CrossFunction)}
     * to define a {@link org.apache.flink.api.common.functions.CrossFunction} which is called for
     * each pair of crossed elements. The CrossFunction returns a exactly one element for each pair
     * of input elements.
     *
     * @param other The other DataSet with which this DataSet is crossed.
     * @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
     * @see org.apache.flink.api.java.operators.CrossOperator.DefaultCross
     * @see org.apache.flink.api.common.functions.CrossFunction
     * @see DataSet
     * @see Tuple2
     */
    public  CrossOperator.DefaultCross crossWithHuge(DataSet other) {
        return new CrossOperator.DefaultCross<>(
                this, other, CrossHint.FIRST_IS_SMALL, Utils.getCallLocationName());
    }

    // --------------------------------------------------------------------------------------------
    //  Iterations
    // --------------------------------------------------------------------------------------------

    /**
     * Initiates an iterative part of the program that executes multiple times and feeds back data
     * sets. The iterative part needs to be closed by calling {@link
     * org.apache.flink.api.java.operators.IterativeDataSet#closeWith(DataSet)}. The data set given
     * to the {@code closeWith(DataSet)} method is the data set that will be fed back and used as
     * the input to the next iteration. The return value of the {@code closeWith(DataSet)} method is
     * the resulting data set after the iteration has terminated.
     *
     * 
An example of an iterative computation is as follows:
     *
     * 
{@code
     * DataSet input = ...;
     *
     * DataSet startOfIteration = input.iterate(10);
     * DataSet toBeFedBack = startOfIteration
     *                               .map(new MyMapper())
     *                               .groupBy(...).reduceGroup(new MyReducer());
     * DataSet result = startOfIteration.closeWith(toBeFedBack);
     * }
     *
     * The iteration has a maximum number of times that it executes. A dynamic termination can be
     * realized by using a termination criterion (see {@link
     * org.apache.flink.api.java.operators.IterativeDataSet#closeWith(DataSet, DataSet)}).
     *
     * @param maxIterations The maximum number of times that the iteration is executed.
     * @return An IterativeDataSet that marks the start of the iterative part and needs to be closed
     *     by {@link org.apache.flink.api.java.operators.IterativeDataSet#closeWith(DataSet)}.
     * @see org.apache.flink.api.java.operators.IterativeDataSet
     */
    public IterativeDataSet iterate(int maxIterations) {
        return new IterativeDataSet<>(getExecutionEnvironment(), getType(), this, maxIterations);
    }

    /**
     * Initiates a delta iteration. A delta iteration is similar to a regular iteration (as started
     * by {@link #iterate(int)}, but maintains state across the individual iteration steps. The
     * Solution set, which represents the current state at the beginning of each iteration can be
     * obtained via {@link org.apache.flink.api.java.operators.DeltaIteration#getSolutionSet()} ()}.
     * It can be be accessed by joining (or CoGrouping) with it. The DataSet that represents the
     * workset of an iteration can be obtained via {@link
     * org.apache.flink.api.java.operators.DeltaIteration#getWorkset()}. The solution set is updated
     * by producing a delta for it, which is merged into the solution set at the end of each
     * iteration step.
     *
     * 
The delta iteration must be closed by calling {@link
     * org.apache.flink.api.java.operators.DeltaIteration#closeWith(DataSet, DataSet)}. The two
     * parameters are the delta for the solution set and the new workset (the data set that will be
     * fed back). The return value of the {@code closeWith(DataSet, DataSet)} method is the
     * resulting data set after the iteration has terminated. Delta iterations terminate when the
     * feed back data set (the workset) is empty. In addition, a maximum number of steps is given as
     * a fall back termination guard.
     *
     * 
Elements in the solution set are uniquely identified by a key. When merging the solution
     * set delta, contained elements with the same key are replaced.
     *
     * 
NOTE: Delta iterations currently support only tuple valued data types. This
     * restriction will be removed in the future. The key is specified by the tuple position.
     *
     * 
A code example for a delta iteration is as follows
     *
     * 
{@code
     * DeltaIteration, Tuple2> iteration =
     *                                                  initialState.iterateDelta(initialFeedbackSet, 100, 0);
     *
     * DataSet> delta = iteration.groupBy(0).aggregate(Aggregations.AVG, 1)
     *                                              .join(iteration.getSolutionSet()).where(0).equalTo(0)
     *                                              .flatMap(new ProjectAndFilter());
     *
     * DataSet> feedBack = delta.join(someOtherSet).where(...).equalTo(...).with(...);
     *
     * // close the delta iteration (delta and new workset are identical)
     * DataSet> result = iteration.closeWith(delta, feedBack);
     * }
     *
     * @param workset The initial version of the data set that is fed back to the next iteration
     *     step (the workset).
     * @param maxIterations The maximum number of iteration steps, as a fall back safeguard.
     * @param keyPositions The position of the tuple fields that is used as the key of the solution
     *     set.
     * @return The DeltaIteration that marks the start of a delta iteration.
     * @see org.apache.flink.api.java.operators.DeltaIteration
     */
    public  DeltaIteration iterateDelta(
            DataSet workset, int maxIterations, int... keyPositions) {
        Preconditions.checkNotNull(workset);
        Preconditions.checkNotNull(keyPositions);

        Keys.ExpressionKeys keys = new Keys.ExpressionKeys<>(keyPositions, getType());
        return new DeltaIteration<>(
                getExecutionEnvironment(), getType(), this, workset, keys, maxIterations);
    }

    // --------------------------------------------------------------------------------------------
    //  Custom Operators
    // -------------------------------------------------------------------------------------------

    /**
     * Runs a {@link CustomUnaryOperation} on the data set. Custom operations are typically complex
     * operators that are composed of multiple steps.
     *
     * @param operation The operation to run.
     * @return The data set produced by the operation.
     */
    public  DataSet runOperation(CustomUnaryOperation operation) {
        Preconditions.checkNotNull(operation, "The custom operator must not be null.");
        operation.setInput(this);
        return operation.createResult();
    }

    // --------------------------------------------------------------------------------------------
    //  Union
    // --------------------------------------------------------------------------------------------

    /**
     * Creates a union of this DataSet with an other DataSet. The other DataSet must be of the same
     * data type.
     *
     * @param other The other DataSet which is unioned with the current DataSet.
     * @return The resulting DataSet.
     */
    public UnionOperator union(DataSet other) {
        return new UnionOperator<>(this, other, Utils.getCallLocationName());
    }

    // --------------------------------------------------------------------------------------------
    //  Partitioning
    // --------------------------------------------------------------------------------------------

    /**
     * Hash-partitions a DataSet on the specified key fields.
     *
     * Important:This operation shuffles the whole DataSet over the network and can take
     * significant amount of time.
     *
     * @param fields The field indexes on which the DataSet is hash-partitioned.
     * @return The partitioned DataSet.
     */
    public PartitionOperator partitionByHash(int... fields) {
        return new PartitionOperator<>(
                this,
                PartitionMethod.HASH,
                new Keys.ExpressionKeys<>(fields, getType()),
                Utils.getCallLocationName());
    }

    /**
     * Hash-partitions a DataSet on the specified key fields.
     *
     * 
Important:This operation shuffles the whole DataSet over the network and can take
     * significant amount of time.
     *
     * @param fields The field expressions on which the DataSet is hash-partitioned.
     * @return The partitioned DataSet.
     */
    public PartitionOperator partitionByHash(String... fields) {
        return new PartitionOperator<>(
                this,
                PartitionMethod.HASH,
                new Keys.ExpressionKeys<>(fields, getType()),
                Utils.getCallLocationName());
    }

    /**
     * Partitions a DataSet using the specified KeySelector.
     *
     * 
Important:This operation shuffles the whole DataSet over the network and can take
     * significant amount of time.
     *
     * @param keyExtractor The KeyExtractor with which the DataSet is hash-partitioned.
     * @return The partitioned DataSet.
     * @see KeySelector
     */
    public > PartitionOperator partitionByHash(
            KeySelector keyExtractor) {
        final TypeInformation keyType =
                TypeExtractor.getKeySelectorTypes(keyExtractor, getType());
        return new PartitionOperator<>(
                this,
                PartitionMethod.HASH,
                new Keys.SelectorFunctionKeys<>(clean(keyExtractor), this.getType(), keyType),
                Utils.getCallLocationName());
    }

    /**
     * Range-partitions a DataSet on the specified key fields.
     *
     * 
Important:This operation requires an extra pass over the DataSet to compute the
     * range boundaries and shuffles the whole DataSet over the network. This can take significant
     * amount of time.
     *
     * @param fields The field indexes on which the DataSet is range-partitioned.
     * @return The partitioned DataSet.
     */
    public PartitionOperator partitionByRange(int... fields) {
        return new PartitionOperator<>(
                this,
                PartitionMethod.RANGE,
                new Keys.ExpressionKeys<>(fields, getType()),
                Utils.getCallLocationName());
    }

    /**
     * Range-partitions a DataSet on the specified key fields.
     *
     * 
Important:This operation requires an extra pass over the DataSet to compute the
     * range boundaries and shuffles the whole DataSet over the network. This can take significant
     * amount of time.
     *
     * @param fields The field expressions on which the DataSet is range-partitioned.
     * @return The partitioned DataSet.
     */
    public PartitionOperator partitionByRange(String... fields) {
        return new PartitionOperator<>(
                this,
                PartitionMethod.RANGE,
                new Keys.ExpressionKeys<>(fields, getType()),
                Utils.getCallLocationName());
    }

    /**
     * Range-partitions a DataSet using the specified KeySelector.
     *
     * 
Important:This operation requires an extra pass over the DataSet to compute the
     * range boundaries and shuffles the whole DataSet over the network. This can take significant
     * amount of time.
     *
     * @param keyExtractor The KeyExtractor with which the DataSet is range-partitioned.
     * @return The partitioned DataSet.
     * @see KeySelector
     */
    public > PartitionOperator partitionByRange(
            KeySelector keyExtractor) {
        final TypeInformation keyType =
                TypeExtractor.getKeySelectorTypes(keyExtractor, getType());
        return new PartitionOperator<>(
                this,
                PartitionMethod.RANGE,
                new Keys.SelectorFunctionKeys<>(clean(keyExtractor), this.getType(), keyType),
                Utils.getCallLocationName());
    }

    /**
     * Partitions a tuple DataSet on the specified key fields using a custom partitioner. This
     * method takes the key position to partition on, and a partitioner that accepts the key type.
     *
     * 
Note: This method works only on single field keys.
     *
     * @param partitioner The partitioner to assign partitions to keys.
     * @param field The field index on which the DataSet is to partitioned.
     * @return The partitioned DataSet.
     */
    public  PartitionOperator partitionCustom(Partitioner partitioner, int field) {
        return new PartitionOperator<>(
                this,
                new Keys.ExpressionKeys<>(new int[] {field}, getType()),
                clean(partitioner),
                Utils.getCallLocationName());
    }

    /**
     * Partitions a POJO DataSet on the specified key fields using a custom partitioner. This method
     * takes the key expression to partition on, and a partitioner that accepts the key type.
     *
     * 
Note: This method works only on single field keys.
     *
     * @param partitioner The partitioner to assign partitions to keys.
     * @param field The field index on which the DataSet is to partitioned.
     * @return The partitioned DataSet.
     */
    public  PartitionOperator partitionCustom(Partitioner partitioner, String field) {
        return new PartitionOperator<>(
                this,
                new Keys.ExpressionKeys<>(new String[] {field}, getType()),
                clean(partitioner),
                Utils.getCallLocationName());
    }

    /**
     * Partitions a DataSet on the key returned by the selector, using a custom partitioner. This
     * method takes the key selector to get the key to partition on, and a partitioner that accepts
     * the key type.
     *
     * 
Note: This method works only on single field keys, i.e. the selector cannot return tuples
     * of fields.
     *
     * @param partitioner The partitioner to assign partitions to keys.
     * @param keyExtractor The KeyExtractor with which the DataSet is partitioned.
     * @return The partitioned DataSet.
     * @see KeySelector
     */
    public > PartitionOperator partitionCustom(
            Partitioner partitioner, KeySelector keyExtractor) {
        final TypeInformation keyType =
                TypeExtractor.getKeySelectorTypes(keyExtractor, getType());
        return new PartitionOperator<>(
                this,
                new Keys.SelectorFunctionKeys<>(keyExtractor, getType(), keyType),
                clean(partitioner),
                Utils.getCallLocationName());
    }

    /**
     * Enforces a re-balancing of the DataSet, i.e., the DataSet is evenly distributed over all
     * parallel instances of the following task. This can help to improve performance in case of
     * heavy data skew and compute intensive operations.
     *
     * 
Important:This operation shuffles the whole DataSet over the network and can take
     * significant amount of time.
     *
     * @return The re-balanced DataSet.
     */
    public PartitionOperator rebalance() {
        return new PartitionOperator<>(
                this, PartitionMethod.REBALANCE, Utils.getCallLocationName());
    }

    // --------------------------------------------------------------------------------------------
    //  Sorting
    // --------------------------------------------------------------------------------------------

    /**
     * Locally sorts the partitions of the DataSet on the specified field in the specified order.
     * DataSet can be sorted on multiple fields by chaining sortPartition() calls.
     *
     * @param field The field index on which the DataSet is sorted.
     * @param order The order in which the DataSet is sorted.
     * @return The DataSet with sorted local partitions.
     */
    public SortPartitionOperator sortPartition(int field, Order order) {
        return new SortPartitionOperator<>(this, field, order, Utils.getCallLocationName());
    }

    /**
     * Locally sorts the partitions of the DataSet on the specified field in the specified order.
     * DataSet can be sorted on multiple fields by chaining sortPartition() calls.
     *
     * @param field The field expression referring to the field on which the DataSet is sorted.
     * @param order The order in which the DataSet is sorted.
     * @return The DataSet with sorted local partitions.
     */
    public SortPartitionOperator sortPartition(String field, Order order) {
        return new SortPartitionOperator<>(this, field, order, Utils.getCallLocationName());
    }

    /**
     * Locally sorts the partitions of the DataSet on the extracted key in the specified order. The
     * DataSet can be sorted on multiple values by returning a tuple from the KeySelector.
     *
     * 
Note that no additional sort keys can be appended to a KeySelector sort keys. To sort the
     * partitions by multiple values using KeySelector, the KeySelector must return a tuple
     * consisting of the values.
     *
     * @param keyExtractor The KeySelector function which extracts the key values from the DataSet
     *     on which the DataSet is sorted.
     * @param order The order in which the DataSet is sorted.
     * @return The DataSet with sorted local partitions.
     */
    public  SortPartitionOperator sortPartition(KeySelector keyExtractor, Order order) {
        final TypeInformation keyType =
                TypeExtractor.getKeySelectorTypes(keyExtractor, getType());
        return new SortPartitionOperator<>(
                this,
                new Keys.SelectorFunctionKeys<>(clean(keyExtractor), getType(), keyType),
                order,
                Utils.getCallLocationName());
    }

    // --------------------------------------------------------------------------------------------
    //  Top-K
    // --------------------------------------------------------------------------------------------

    // --------------------------------------------------------------------------------------------
    //  Result writing
    // --------------------------------------------------------------------------------------------

    /**
     * Writes a DataSet as text file(s) to the specified location.
     *
     * 
For each element of the DataSet the result of {@link Object#toString()} is written.

     * 

     * Output files and directories

     * What output how writeAsText() method produces is depending on other circumstance
     *
     * 

     *   A directory is created and multiple files are written underneath. (Default behavior)
     *       

     *       This sink creates a directory called "path1", and files "1", "2" ... are writen
     *       underneath depending on parallelism
     *       {@code .
     * └── path1/
     *     ├── 1
     *     ├── 2
     *     └── ...}
     *       Code Example
     *       {@code dataset.writeAsText("file:///path1");}
     *   
A single file called "path1" is created when parallelism is set to 1
     *       {@code .
     * └── path1 }
     *       Code Example
     *       {@code // Parallelism is set to only this particular operation
     * dataset.writeAsText("file:///path1").setParallelism(1);
     *
     * // This will creates the same effect but note all operators' parallelism are set to one
     * env.setParallelism(1);
     * ...
     * dataset.writeAsText("file:///path1"); }
     *   
A directory is always created when fs.output.always-create-directory
     *       is set to true in flink-conf.yaml file, even when parallelism is set to 1.
     *       {@code .
     * └── path1/
     *     └── 1 }
     *       Code Example
     *       {@code // fs.output.always-create-directory = true
     * dataset.writeAsText("file:///path1").setParallelism(1); }
     * 
     *
     * @param filePath The path pointing to the location the text file or files under the directory
     *     is written to.
     * @return The DataSink that writes the DataSet.
     * @see TextOutputFormat
     */
    public DataSink writeAsText(String filePath) {
        return output(new TextOutputFormat(new Path(filePath)));
    }

    /**
     * Writes a DataSet as text file(s) to the specified location.
     *
     * For each element of the DataSet the result of {@link Object#toString()} is written.
     *
     * @param filePath The path pointing to the location the text file is written to.
     * @param writeMode Control the behavior for existing files. Options are NO_OVERWRITE and
     *     OVERWRITE.
     * @return The DataSink that writes the DataSet.
     * @see TextOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsText(String filePath, WriteMode writeMode) {
        TextOutputFormat tof = new TextOutputFormat<>(new Path(filePath));
        tof.setWriteMode(writeMode);
        return output(tof);
    }

    /**
     * Writes a DataSet as text file(s) to the specified location.
     *
     * 
For each element of the DataSet the result of {@link TextFormatter#format(Object)} is
     * written.
     *
     * @param filePath The path pointing to the location the text file is written to.
     * @param formatter formatter that is applied on every element of the DataSet.
     * @return The DataSink that writes the DataSet.
     * @see TextOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsFormattedText(String filePath, TextFormatter formatter) {
        return map(new FormattingMapper<>(clean(formatter))).writeAsText(filePath);
    }

    /**
     * Writes a DataSet as text file(s) to the specified location.
     *
     * 
For each element of the DataSet the result of {@link TextFormatter#format(Object)} is
     * written.
     *
     * @param filePath The path pointing to the location the text file is written to.
     * @param writeMode Control the behavior for existing files. Options are NO_OVERWRITE and
     *     OVERWRITE.
     * @param formatter formatter that is applied on every element of the DataSet.
     * @return The DataSink that writes the DataSet.
     * @see TextOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsFormattedText(
            String filePath, WriteMode writeMode, TextFormatter formatter) {
        return map(new FormattingMapper<>(clean(formatter))).writeAsText(filePath, writeMode);
    }

    /**
     * Writes a {@link Tuple} DataSet as CSV file(s) to the specified location.
     *
     * 
Note: Only a Tuple DataSet can written as a CSV file.
     *
     * 
For each Tuple field the result of {@link Object#toString()} is written. Tuple fields are
     * separated by the default field delimiter {@code "comma" (,)}.
     *
     * 
Tuples are are separated by the newline character ({@code \n}).
     *
     * @param filePath The path pointing to the location the CSV file is written to.
     * @return The DataSink that writes the DataSet.
     * @see Tuple
     * @see CsvOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsCsv(String filePath) {
        return writeAsCsv(
                filePath,
                CsvOutputFormat.DEFAULT_LINE_DELIMITER,
                CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
    }

    /**
     * Writes a {@link Tuple} DataSet as CSV file(s) to the specified location.
     *
     * 
Note: Only a Tuple DataSet can written as a CSV file.
     *
     * 
For each Tuple field the result of {@link Object#toString()} is written. Tuple fields are
     * separated by the default field delimiter {@code "comma" (,)}.
     *
     * 
Tuples are are separated by the newline character ({@code \n}).
     *
     * @param filePath The path pointing to the location the CSV file is written to.
     * @param writeMode The behavior regarding existing files. Options are NO_OVERWRITE and
     *     OVERWRITE.
     * @return The DataSink that writes the DataSet.
     * @see Tuple
     * @see CsvOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsCsv(String filePath, WriteMode writeMode) {
        return internalWriteAsCsv(
                new Path(filePath),
                CsvOutputFormat.DEFAULT_LINE_DELIMITER,
                CsvOutputFormat.DEFAULT_FIELD_DELIMITER,
                writeMode);
    }

    /**
     * Writes a {@link Tuple} DataSet as CSV file(s) to the specified location with the specified
     * field and line delimiters.
     *
     * 
Note: Only a Tuple DataSet can written as a CSV file.
     *
     * 
For each Tuple field the result of {@link Object#toString()} is written.
     *
     * @param filePath The path pointing to the location the CSV file is written to.
     * @param rowDelimiter The row delimiter to separate Tuples.
     * @param fieldDelimiter The field delimiter to separate Tuple fields.
     * @see Tuple
     * @see CsvOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter) {
        return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, null);
    }

    /**
     * Writes a {@link Tuple} DataSet as CSV file(s) to the specified location with the specified
     * field and line delimiters.
     *
     * 
Note: Only a Tuple DataSet can written as a CSV file. For each Tuple field the
     * result of {@link Object#toString()} is written.
     *
     * @param filePath The path pointing to the location the CSV file is written to.
     * @param rowDelimiter The row delimiter to separate Tuples.
     * @param fieldDelimiter The field delimiter to separate Tuple fields.
     * @param writeMode The behavior regarding existing files. Options are NO_OVERWRITE and
     *     OVERWRITE.
     * @see Tuple
     * @see CsvOutputFormat
     * @see DataSet#writeAsText(String) Output files and directories
     */
    public DataSink writeAsCsv(
            String filePath, String rowDelimiter, String fieldDelimiter, WriteMode writeMode) {
        return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, writeMode);
    }

    @SuppressWarnings("unchecked")
    private  DataSink internalWriteAsCsv(
            Path filePath, String rowDelimiter, String fieldDelimiter, WriteMode wm) {
        Preconditions.checkArgument(
                getType().isTupleType(),
                "The writeAsCsv() method can only be used on data sets of tuples.");
        CsvOutputFormat of = new CsvOutputFormat<>(filePath, rowDelimiter, fieldDelimiter);
        if (wm != null) {
            of.setWriteMode(wm);
        }
        return output((OutputFormat) of);
    }

    /**
     * Prints the elements in a DataSet to the standard output stream {@link System#out} of the JVM
     * that calls the print() method. For programs that are executed in a cluster, this method needs
     * to gather the contents of the DataSet back to the client, to print it there.
     *
     * 
The string written for each element is defined by the {@link Object#toString()} method.
     *
     * 
This method immediately triggers the program execution, similar to the {@link #collect()}
     * and {@link #count()} methods.
     *
     * @see #printToErr()
     * @see #printOnTaskManager(String)
     */
    public void print() throws Exception {
        List elements = collect();
        for (T e : elements) {
            System.out.println(e);
        }
    }

    /**
     * Prints the elements in a DataSet to the standard error stream {@link System#err} of the JVM
     * that calls the print() method. For programs that are executed in a cluster, this method needs
     * to gather the contents of the DataSet back to the client, to print it there.
     *
     * 
The string written for each element is defined by the {@link Object#toString()} method.
     *
     * 
This method immediately triggers the program execution, similar to the {@link #collect()}
     * and {@link #count()} methods.
     *
     * @see #print()
     * @see #printOnTaskManager(String)
     */
    public void printToErr() throws Exception {
        List elements = collect();
        for (T e : elements) {
            System.err.println(e);
        }
    }

    /**
     * Writes a DataSet to the standard output streams (stdout) of the TaskManagers that execute the
     * program (or more specifically, the data sink operators). On a typical cluster setup, the data
     * will appear in the TaskManagers' .out files.
     *
     * 
To print the data to the console or stdout stream of the client process instead, use the
     * {@link #print()} method.
     *
     * 
For each element of the DataSet the result of {@link Object#toString()} is written.
     *
     * @param prefix The string to prefix each line of the output with. This helps identifying
     *     outputs from different printing sinks.
     * @return The DataSink operator that writes the DataSet.
     * @see #print()
     */
    public DataSink printOnTaskManager(String prefix) {
        return output(new PrintingOutputFormat(prefix, false));
    }

    /**
     * Writes a DataSet to the standard output stream (stdout).
     *
     * 
For each element of the DataSet the result of {@link Object#toString()} is written.
     *
     * @param sinkIdentifier The string to prefix the output with.
     * @return The DataSink that writes the DataSet.
     * @deprecated Use {@link #printOnTaskManager(String)} instead.
     */
    @Deprecated
    @PublicEvolving
    public DataSink print(String sinkIdentifier) {
        return output(new PrintingOutputFormat(sinkIdentifier, false));
    }

    /**
     * Writes a DataSet to the standard error stream (stderr).
     *
     * For each element of the DataSet the result of {@link Object#toString()} is written.
     *
     * @param sinkIdentifier The string to prefix the output with.
     * @return The DataSink that writes the DataSet.
     * @deprecated Use {@link #printOnTaskManager(String)} instead, or the {@link
     *     PrintingOutputFormat}.
     */
    @Deprecated
    @PublicEvolving
    public DataSink printToErr(String sinkIdentifier) {
        return output(new PrintingOutputFormat(sinkIdentifier, true));
    }

    /**
     * Writes a DataSet using a {@link FileOutputFormat} to a specified location. This method adds a
     * data sink to the program.
     *
     * @param outputFormat The FileOutputFormat to write the DataSet.
     * @param filePath The path to the location where the DataSet is written.
     * @return The DataSink that writes the DataSet.
     * @see FileOutputFormat
     */
    public DataSink write(FileOutputFormat outputFormat, String filePath) {
        Preconditions.checkNotNull(filePath, "File path must not be null.");
        Preconditions.checkNotNull(outputFormat, "Output format must not be null.");

        outputFormat.setOutputFilePath(new Path(filePath));
        return output(outputFormat);
    }

    /**
     * Writes a DataSet using a {@link FileOutputFormat} to a specified location. This method adds a
     * data sink to the program.
     *
     * @param outputFormat The FileOutputFormat to write the DataSet.
     * @param filePath The path to the location where the DataSet is written.
     * @param writeMode The mode of writing, indicating whether to overwrite existing files.
     * @return The DataSink that writes the DataSet.
     * @see FileOutputFormat
     */
    public DataSink write(
            FileOutputFormat outputFormat, String filePath, WriteMode writeMode) {
        Preconditions.checkNotNull(filePath, "File path must not be null.");
        Preconditions.checkNotNull(writeMode, "Write mode must not be null.");
        Preconditions.checkNotNull(outputFormat, "Output format must not be null.");

        outputFormat.setOutputFilePath(new Path(filePath));
        outputFormat.setWriteMode(writeMode);
        return output(outputFormat);
    }

    /**
     * Emits a DataSet using an {@link OutputFormat}. This method adds a data sink to the program.
     * Programs may have multiple data sinks. A DataSet may also have multiple consumers (data sinks
     * or transformations) at the same time.
     *
     * @param outputFormat The OutputFormat to process the DataSet.
     * @return The DataSink that processes the DataSet.
     * @see OutputFormat
     * @see DataSink
     */
    public DataSink output(OutputFormat outputFormat) {
        Preconditions.checkNotNull(outputFormat);

        // configure the type if needed
        if (outputFormat instanceof InputTypeConfigurable) {
            ((InputTypeConfigurable) outputFormat).setInputType(getType(), context.getConfig());
        }

        DataSink sink = new DataSink<>(this, outputFormat, getType());
        this.context.registerDataSink(sink);
        return sink;
    }

    // --------------------------------------------------------------------------------------------
    //  Utilities
    // --------------------------------------------------------------------------------------------

    protected static void checkSameExecutionContext(DataSet set1, DataSet set2) {
        if (set1.getExecutionEnvironment() != set2.getExecutionEnvironment()) {
            throw new IllegalArgumentException("The two inputs have different execution contexts.");
        }
    }
}