/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.api.scala

import org.apache.flink.api.common.InvalidProgramException
import org.apache.flink.api.common.accumulators.SerializedListAccumulator
import org.apache.flink.api.common.aggregators.Aggregator
import org.apache.flink.api.common.functions._
import org.apache.flink.api.common.io.{FileOutputFormat, OutputFormat}
import org.apache.flink.api.common.operators.Order
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint
import org.apache.flink.api.common.operators.base.CrossOperatorBase.CrossHint
import org.apache.flink.api.common.operators.base.PartitionOperatorBase.PartitionMethod
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.Utils.CountHelper
import org.apache.flink.api.java.aggregation.Aggregations
import org.apache.flink.api.java.functions.{FirstReducer, KeySelector}
import org.apache.flink.api.java.io.{DiscardingOutputFormat, PrintingOutputFormat, TextOutputFormat}
import org.apache.flink.api.java.operators.Keys.ExpressionKeys
import org.apache.flink.api.java.operators._
import org.apache.flink.api.java.operators.join.JoinType
import org.apache.flink.api.java.{DataSet => JavaDataSet, Utils}
import org.apache.flink.api.scala.operators.{ScalaAggregateOperator, ScalaCsvOutputFormat}
import org.apache.flink.configuration.Configuration
import org.apache.flink.core.fs.{FileSystem, Path}
import org.apache.flink.util.{AbstractID, Collector}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

/**
 * The DataSet, the basic abstraction of Flink. This represents a collection of elements of a
 * specific type `T`. The operations in this class can be used to create new DataSets and to combine
 * two DataSets. The methods of [[ExecutionEnvironment]] can be used to create a DataSet from an
 * external source, such as files in HDFS. The `write*` methods can be used to write the elements
 * to storage.
 *
 * All operations accept either a lambda function or an operation-specific function object for
 * specifying the operation. For example, using a lambda:
 * {{{
 *   val input: DataSet[String] = ...
 *   val mapped = input flatMap { _.split(" ") }
 * }}}
 * And using a `FlatMapFunction`:
 * {{{
 *   val input: DataSet[String] = ...
 *   val mapped = input flatMap { new FlatMapFunction[String, String] {
 *     def flatMap(in: String, out: Collector[String]): Unit = {
 *       in.split(" ") foreach { out.collect(_) }
 *     }
 *   }}
 * }}}
 *
 * A rich function can be used when more control is required, for example for accessing the
 * `RuntimeContext`. The rich function for `flatMap` is `RichFlatMapFunction`, all other functions
 * are named similarly. All functions are available in package
 * `org.apache.flink.api.common.functions`.
 *
 * The elements are partitioned depending on the parallelism of the
 * [[ExecutionEnvironment]] or of one specific DataSet.
 *
 * Most of the operations have an implicit [[TypeInformation]] parameter. This is supplied by
 * an implicit conversion in the `flink.api.scala` package. For this to work,
 * [[createTypeInformation]] needs to be imported. This is normally achieved with a
 * {{{
 *   import org.apache.flink.api.scala._
 * }}}
 *
 * @tparam T The type of the DataSet, i.e., the type of the elements of the DataSet.
 */
class DataSet[T: ClassTag](set: JavaDataSet[T]) {
  require(set != null, "Java DataSet must not be null.")

  /**
   * Returns the TypeInformation for the elements of this DataSet.
   */
  def getType(): TypeInformation[T] = set.getType()

  /**
   * Returns the execution environment associated with the current DataSet.
   * @return associated execution environment
   */
  def getExecutionEnvironment: ExecutionEnvironment =
    new ExecutionEnvironment(set.getExecutionEnvironment)

  /**
   * Returns the underlying Java DataSet.
   */
  private[flink] def javaSet: JavaDataSet[T] = set

  /* This code is originally from the Apache Spark project. */
  /**
   * Clean a closure to make it ready to be serialized and sent to tasks
   * (removes unreferenced variables in $outer's, updates REPL variables).
   * If checkSerializable is set, clean will also proactively
   * check to see if f is serializable and throw an [[InvalidProgramException]]
   * if not.
   *
   * @param f the closure to clean
   * @param checkSerializable whether or not to immediately check f for serializability
   * @throws InvalidProgramException if checkSerializable is set but f
   *          is not serializable
   */
  private[flink] def clean[F <: AnyRef](f: F, checkSerializable: Boolean = true): F = {
    if (set.getExecutionEnvironment.getConfig.isClosureCleanerEnabled) {
      ClosureCleaner.clean(f, checkSerializable)
    }
    ClosureCleaner.ensureSerializable(f)
    f
  }

  // --------------------------------------------------------------------------------------------
  //  General methods
  // --------------------------------------------------------------------------------------------
  // These are actually implemented in subclasses of the Java DataSet but we perform checking
  // here and just pass through the calls to make everything much simpler.

  /**
   * Sets the name of the DataSet. This will appear in logs and graphical
   * representations of the execution graph.
   */
  def name(name: String) = {
    javaSet match {
      case ds: DataSource[_] => ds.name(name)
      case op: Operator[_, _] => op.name(name)
      case di: DeltaIterationResultSet[_, _] => di.getIterationHead.name(name)
      case _ =>
        throw new UnsupportedOperationException("Operator " + javaSet.toString +
          " cannot have a name.")
    }
    // return this for chaining method calls
    this
  }

  /**
   * Sets the parallelism of this operation. This must be at least 1.
   */
  def setParallelism(parallelism: Int) = {
    javaSet match {
      case ds: DataSource[_] => ds.setParallelism(parallelism)
      case op: Operator[_, _] => op.setParallelism(parallelism)
      case di: DeltaIterationResultSet[_, _] => di.getIterationHead.parallelism(parallelism)
      case _ =>
        throw new UnsupportedOperationException("Operator " + javaSet.toString + " cannot have " +
          "parallelism.")
    }
    this
  }

  /**
   * Returns the parallelism of this operation.
   */
  def getParallelism: Int = javaSet match {
    case ds: DataSource[_] => ds.getParallelism
    case op: Operator[_, _] => op.getParallelism
    case _ =>
      throw new UnsupportedOperationException("Operator " + javaSet.toString + " does not have " +
        "parallelism.")
  }

  /**
   * Registers an [[org.apache.flink.api.common.aggregators.Aggregator]]
   * for the iteration. Aggregators can be used to maintain simple statistics during the
   * iteration, such as number of elements processed. The aggregators compute global aggregates:
   * After each iteration step, the values are globally aggregated to produce one aggregate that
   * represents statistics across all parallel instances.
   * The value of an aggregator can be accessed in the next iteration.
   *
   * Aggregators can be accessed inside a function via
   * [[org.apache.flink.api.common.functions.AbstractRichFunction#getIterationRuntimeContext]].
   *
   * @param name The name under which the aggregator is registered.
   * @param aggregator The aggregator class.
   */
  def registerAggregator(name: String, aggregator: Aggregator[_]): DataSet[T] = {
    javaSet match {
      case di: DeltaIterationResultSet[_, _] =>
        di.getIterationHead.registerAggregator(name, aggregator)
      case _ =>
        throw new UnsupportedOperationException("Operator " + javaSet.toString + " cannot have " +
          "aggregators.")
    }
    this
  }

  /**
   * Adds a certain data set as a broadcast set to this operator. Broadcast data sets are
   * available at all parallel instances of this operator. A broadcast data set is registered
   * under a certain name, and can be retrieved under that name from the operator's runtime
   * context via
   * `org.apache.flink.api.common.functions.RuntimeContext.getBroadcastVariable(String)`.
   *
   * The runtime context itself is available in all UDFs via
   * `org.apache.flink.api.common.functions.AbstractRichFunction#getRuntimeContext()`
   *
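   * For example (an illustrative sketch, not from the original scaladoc; the DataSets and the
   * broadcast name below are hypothetical):
   * {{{
   *   val toBroadcast: DataSet[Int] = ...
   *   val data: DataSet[String] = ...
   *   val prepared = data
   *     .map(new RichMapFunction[String, String] {
   *       override def map(in: String): String = {
   *         val bc = getRuntimeContext.getBroadcastVariable[Int]("smallSet")
   *         in + "/" + bc.size()
   *       }
   *     })
   *     .withBroadcastSet(toBroadcast, "smallSet")
   * }}}
   *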
   * @param data The data set to be broadcasted.
   * @param name The name under which the broadcast data set is retrieved.
   * @return The operator itself, to allow chaining function calls.
   */
  def withBroadcastSet(data: DataSet[_], name: String) = {
    javaSet match {
      case udfOp: UdfOperator[_] => udfOp.withBroadcastSet(data.javaSet, name)
      case _ =>
        throw new UnsupportedOperationException("Operator " + javaSet.toString + " cannot have " +
          "broadcast variables.")
    }
    this
  }

  def withForwardedFields(forwardedFields: String*) = {
    javaSet match {
      case op: SingleInputUdfOperator[_, _, _] => op.withForwardedFields(forwardedFields: _*)
      case _ =>
        throw new UnsupportedOperationException("Cannot specify forwarded fields for Operator " +
          javaSet.toString + ".")
    }
    this
  }

  def withForwardedFieldsFirst(forwardedFields: String*) = {
    javaSet match {
      case op: TwoInputUdfOperator[_, _, _, _] => op.withForwardedFieldsFirst(forwardedFields: _*)
      case _ =>
        throw new UnsupportedOperationException("Cannot specify forwarded fields for Operator " +
          javaSet.toString + ".")
    }
    this
  }

  def withForwardedFieldsSecond(forwardedFields: String*) = {
    javaSet match {
      case op: TwoInputUdfOperator[_, _, _, _] => op.withForwardedFieldsSecond(forwardedFields: _*)
      case _ =>
        throw new UnsupportedOperationException("Cannot specify forwarded fields for Operator " +
          javaSet.toString + ".")
    }
    this
  }

  def withParameters(parameters: Configuration): DataSet[T] = {
    javaSet match {
      case udfOp: UdfOperator[_] => udfOp.withParameters(parameters)
      case source: DataSource[_] => source.withParameters(parameters)
      case _ =>
        throw new UnsupportedOperationException("Operator " + javaSet.toString 
            + " cannot have parameters")
    }
    this
  }

  // --------------------------------------------------------------------------------------------
  //  Filter & Transformations
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a new DataSet by applying the given function to every element of this DataSet.
   */
  def map[R: TypeInformation: ClassTag](mapper: MapFunction[T, R]): DataSet[R] = {
    if (mapper == null) {
      throw new NullPointerException("Map function must not be null.")
    }
    wrap(new MapOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      mapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to every element of this DataSet.
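   *
   * For example (an illustrative sketch, not from the original scaladoc; `words` is a
   * hypothetical input DataSet):
   * {{{
   *   val words: DataSet[String] = ...
   *   val lengths: DataSet[Int] = words.map { w => w.length }
   * }}}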
   */
  def map[R: TypeInformation: ClassTag](fun: T => R): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("Map function must not be null.")
    }
    val mapper = new MapFunction[T, R] {
      val cleanFun = clean(fun)
      def map(in: T): R = cleanFun(in)
    }
    wrap(new MapOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      mapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to each parallel partition of the
   * DataSet.
   *
   * This function is intended for operations that cannot transform individual elements and
   * that require no grouping of elements. To transform individual elements,
   * the use of [[map]] and [[flatMap]] is preferable.
   */
  def mapPartition[R: TypeInformation: ClassTag](
      partitionMapper: MapPartitionFunction[T, R]): DataSet[R] = {
    if (partitionMapper == null) {
      throw new NullPointerException("MapPartition function must not be null.")
    }
    wrap(new MapPartitionOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      partitionMapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to each parallel partition of the
   * DataSet.
   *
   * This function is intended for operations that cannot transform individual elements and
   * that require no grouping of elements. To transform individual elements,
   * the use of [[map]] and [[flatMap]] is preferable.
   */
  def mapPartition[R: TypeInformation: ClassTag](
      fun: (Iterator[T], Collector[R]) => Unit): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("MapPartition function must not be null.")
    }
    val partitionMapper = new MapPartitionFunction[T, R] {
      val cleanFun = clean(fun)
      def mapPartition(in: java.lang.Iterable[T], out: Collector[R]) {
        cleanFun(in.iterator().asScala, out)
      }
    }
    wrap(new MapPartitionOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      partitionMapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to each parallel partition of the
   * DataSet.
   *
   * This function is intended for operations that cannot transform individual elements and
   * that require no grouping of elements. To transform individual elements,
   * the use of [[map]] and [[flatMap]] is preferable.
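   *
   * For example (an illustrative sketch, not from the original scaladoc; `lines` is a
   * hypothetical input DataSet):
   * {{{
   *   val lines: DataSet[String] = ...
   *   // emit one element per parallel partition, containing the partition's element count
   *   val partitionSizes: DataSet[Long] = lines.mapPartition { in => Seq(in.size.toLong) }
   * }}}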
   */
  def mapPartition[R: TypeInformation: ClassTag](
      fun: (Iterator[T]) => TraversableOnce[R]): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("MapPartition function must not be null.")
    }
    val partitionMapper = new MapPartitionFunction[T, R] {
      val cleanFun = clean(fun)
      def mapPartition(in: java.lang.Iterable[T], out: Collector[R]) {
        cleanFun(in.iterator().asScala) foreach out.collect
      }
    }
    wrap(new MapPartitionOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      partitionMapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to every element and flattening
   * the results.
   */
  def flatMap[R: TypeInformation: ClassTag](flatMapper: FlatMapFunction[T, R]): DataSet[R] = {
    if (flatMapper == null) {
      throw new NullPointerException("FlatMap function must not be null.")
    }
    wrap(new FlatMapOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      flatMapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to every element and flattening
   * the results.
   */
  def flatMap[R: TypeInformation: ClassTag](fun: (T, Collector[R]) => Unit): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("FlatMap function must not be null.")
    }
    val flatMapper = new FlatMapFunction[T, R] {
      val cleanFun = clean(fun)
      def flatMap(in: T, out: Collector[R]) { cleanFun(in, out) }
    }
    wrap(new FlatMapOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      flatMapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet by applying the given function to every element and flattening
   * the results.
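   *
   * For example (an illustrative sketch, not from the original scaladoc; `lines` is a
   * hypothetical input DataSet):
   * {{{
   *   val lines: DataSet[String] = ...
   *   val words: DataSet[String] = lines.flatMap { _.split(" ") }
   * }}}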
   */
  def flatMap[R: TypeInformation: ClassTag](fun: T => TraversableOnce[R]): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("FlatMap function must not be null.")
    }
    val flatMapper = new FlatMapFunction[T, R] {
      val cleanFun = clean(fun)
      def flatMap(in: T, out: Collector[R]) { cleanFun(in) foreach out.collect }
    }
    wrap(new FlatMapOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      flatMapper,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet that contains only the elements satisfying the given filter predicate.
   */
  def filter(filter: FilterFunction[T]): DataSet[T] = {
    if (filter == null) {
      throw new NullPointerException("Filter function must not be null.")
    }
    wrap(new FilterOperator[T](javaSet, filter, getCallLocationName()))
  }

  /**
   * Creates a new DataSet that contains only the elements satisfying the given filter predicate.
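   *
   * For example (an illustrative sketch, not from the original scaladoc; `numbers` is a
   * hypothetical input DataSet):
   * {{{
   *   val numbers: DataSet[Int] = ...
   *   val positive = numbers.filter { _ > 0 }
   * }}}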
   */
  def filter(fun: T => Boolean): DataSet[T] = {
    if (fun == null) {
      throw new NullPointerException("Filter function must not be null.")
    }
    val filter = new FilterFunction[T] {
      val cleanFun = clean(fun)
      def filter(in: T) = cleanFun(in)
    }
    wrap(new FilterOperator[T](javaSet, filter, getCallLocationName()))
  }

  // --------------------------------------------------------------------------------------------
  //  Non-grouped aggregations
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a new [[DataSet]] by aggregating the specified tuple field using the given aggregation
   * function. Since this is not a keyed DataSet the aggregation will be performed on the whole
   * collection of elements.
   *
   * This only works on Tuple DataSets.
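   *
   * For example (an illustrative sketch, not from the original scaladoc; `sales` is a
   * hypothetical tuple DataSet):
   * {{{
   *   val sales: DataSet[(String, Int)] = ...
   *   // total of the second tuple field across the whole DataSet; equivalent to sales.sum(1)
   *   val total = sales.aggregate(Aggregations.SUM, 1)
   * }}}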
   */
  def aggregate(agg: Aggregations, field: Int): AggregateDataSet[T] = {
    new AggregateDataSet(new ScalaAggregateOperator[T](javaSet, agg, field))
  }

  /**
   * Creates a new [[DataSet]] by aggregating the specified field using the given aggregation
   * function. Since this is not a keyed DataSet the aggregation will be performed on the whole
   * collection of elements.
   *
   * This only works on CaseClass DataSets.
   */
  def aggregate(agg: Aggregations, field: String): AggregateDataSet[T] = {
    val fieldIndex = fieldNames2Indices(javaSet.getType, Array(field))(0)

    new AggregateDataSet(new ScalaAggregateOperator[T](javaSet, agg, fieldIndex))
  }

  /**
   * Syntactic sugar for [[aggregate]] with `SUM`
   */
  def sum(field: Int) = {
    aggregate(Aggregations.SUM, field)
  }

  /**
   * Syntactic sugar for [[aggregate]] with `MAX`
   */
  def max(field: Int) = {
    aggregate(Aggregations.MAX, field)
  }

  /**
   * Syntactic sugar for [[aggregate]] with `MIN`
   */
  def min(field: Int) = {
    aggregate(Aggregations.MIN, field)
  }

  /**
   * Syntactic sugar for [[aggregate]] with `SUM`
   */
  def sum(field: String) = {
    aggregate(Aggregations.SUM, field)
  }

  /**
   * Syntactic sugar for [[aggregate]] with `MAX`
   */
  def max(field: String) = {
    aggregate(Aggregations.MAX, field)
  }

  /**
   * Syntactic sugar for [[aggregate]] with `MIN`
   */
  def min(field: String) = {
    aggregate(Aggregations.MIN, field)
  }

  /**
   * Convenience method to get the count (number of elements) of a DataSet
   *
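   * For example (an illustrative sketch, not from the original scaladoc; note that this
   * triggers program execution, as the implementation below shows):
   * {{{
   *   val input: DataSet[String] = ...
   *   val numElements: Long = input.count()
   * }}}
   *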
   * @return A long integer that represents the number of elements in the set
   *
   * @see org.apache.flink.api.java.Utils.CountHelper
   */
  @throws(classOf[Exception])
  def count(): Long = {
    val id = new AbstractID().toString
    javaSet.flatMap(new CountHelper[T](id)).output(new DiscardingOutputFormat[java.lang.Long])
    val res = getExecutionEnvironment.execute()
    res.getAccumulatorResult[Long](id)
  }

  /**
   * Convenience method to get the elements of a DataSet as a Seq.
   * As a DataSet can contain a lot of data, this method should be used with caution.
   *
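   * For example (an illustrative sketch, not from the original scaladoc; like [[count()]],
   * this triggers program execution):
   * {{{
   *   val input: DataSet[(String, Int)] = ...
   *   val elements: Seq[(String, Int)] = input.collect()
   * }}}
   *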
   * @return A Seq containing the elements of the DataSet
   *
   * @see org.apache.flink.api.java.Utils.CollectHelper
   */
  @throws(classOf[Exception])
  def collect(): Seq[T] = {
    val id = new AbstractID().toString
    val serializer = getType().createSerializer(getExecutionEnvironment.getConfig)
    
    javaSet.flatMap(new Utils.CollectHelper[T](id, serializer))
           .output(new DiscardingOutputFormat[T])
    
    val res = getExecutionEnvironment.execute()

    val accResult: java.util.ArrayList[Array[Byte]] = res.getAccumulatorResult(id)

    try {
      SerializedListAccumulator.deserializeList(accResult, serializer).asScala
    }
    catch {
      case e: ClassNotFoundException => {
        throw new RuntimeException("Cannot find type class of collected data type.", e)
      }
      case e: java.io.IOException => {
        throw new RuntimeException("Serialization error while deserializing collected data", e)
      }
    }
  }

  /**
   * Creates a new [[DataSet]] by merging the elements of this DataSet using an associative reduce
   * function.
   */
  def reduce(reducer: ReduceFunction[T]): DataSet[T] = {
    if (reducer == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    wrap(new ReduceOperator[T](javaSet, reducer, getCallLocationName()))
  }

  /**
   * Creates a new [[DataSet]] by merging the elements of this DataSet using an associative reduce
   * function.
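   *
   * For example (an illustrative sketch, not from the original scaladoc; `numbers` is a
   * hypothetical input DataSet):
   * {{{
   *   val numbers: DataSet[Int] = ...
   *   val sum = numbers.reduce { _ + _ }
   * }}}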
   */
  def reduce(fun: (T, T) => T): DataSet[T] = {
    if (fun == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    val reducer = new ReduceFunction[T] {
      val cleanFun = clean(fun)
      def reduce(v1: T, v2: T) = { cleanFun(v1, v2) }
    }
    wrap(new ReduceOperator[T](javaSet, reducer, getCallLocationName()))
  }

  /**
   * Creates a new [[DataSet]] by passing all elements in this DataSet to the group reduce function.
   * The function can output zero or more elements using the [[Collector]]. The concatenation of the
   * emitted values will form the resulting [[DataSet]].
   */
  def reduceGroup[R: TypeInformation: ClassTag](reducer: GroupReduceFunction[T, R]): DataSet[R] = {
    if (reducer == null) {
      throw new NullPointerException("GroupReduce function must not be null.")
    }
    wrap(new GroupReduceOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      reducer,
      getCallLocationName()))
  }

  /**
   * Creates a new [[DataSet]] by passing all elements in this DataSet to the group reduce function.
   * The function can output zero or more elements using the [[Collector]]. The concatenation of the
   * emitted values will form the resulting [[DataSet]].
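   *
   * For example (an illustrative sketch, not from the original scaladoc; `words` is a
   * hypothetical input DataSet):
   * {{{
   *   val words: DataSet[String] = ...
   *   // emit a single element with the number of distinct words
   *   val distinctCount = words.reduceGroup { (in, out: Collector[Int]) =>
   *     out.collect(in.toSet.size)
   *   }
   * }}}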
   */
  def reduceGroup[R: TypeInformation: ClassTag](
      fun: (Iterator[T], Collector[R]) => Unit): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("GroupReduce function must not be null.")
    }
    val reducer = new GroupReduceFunction[T, R] {
      val cleanFun = clean(fun)
      def reduce(in: java.lang.Iterable[T], out: Collector[R]) {
        cleanFun(in.iterator().asScala, out)
      }
    }
    wrap(new GroupReduceOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      reducer,
      getCallLocationName()))
  }

  /**
   * Creates a new [[DataSet]] by passing all elements in this DataSet to the group reduce function.
   */
  def reduceGroup[R: TypeInformation: ClassTag](fun: (Iterator[T]) => R): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("GroupReduce function must not be null.")
    }
    val reducer = new GroupReduceFunction[T, R] {
      val cleanFun = clean(fun)
      def reduce(in: java.lang.Iterable[T], out: Collector[R]) {
        out.collect(cleanFun(in.iterator().asScala))
      }
    }
    wrap(new GroupReduceOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      reducer,
      getCallLocationName()))
  }

  /**
   *  Applies a GroupCombineFunction on a non-grouped [[DataSet]]. A
   *  GroupCombineFunction is similar to a GroupReduceFunction but does not
   *  perform a full data exchange. Instead, the GroupCombineFunction calls
   *  the combine method once per partition for combining a group of
   *  results. This operator is suitable for combining values into an
   *  intermediate format before doing a proper groupReduce where the
   *  data is shuffled across the nodes for further reduction. The
   *  GroupReduce operator can also be supplied with a combiner by
   *  implementing the RichGroupReduce function. The combine method of
   *  the RichGroupReduce function requires the input and output types to be
   *  the same. The GroupCombineFunction, on the other hand, can have an
   *  arbitrary output type.
   */
  def combineGroup[R: TypeInformation: ClassTag](
      combiner: GroupCombineFunction[T, R]): DataSet[R] = {
    if (combiner == null) {
      throw new NullPointerException("Combine function must not be null.")
    }
    wrap(new GroupCombineOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      combiner,
      getCallLocationName()))
  }

  /**
   *  Applies a GroupCombineFunction on a non-grouped [[DataSet]]. A
   *  GroupCombineFunction is similar to a GroupReduceFunction but does not
   *  perform a full data exchange. Instead, the GroupCombineFunction calls
   *  the combine method once per partition for combining a group of
   *  results. This operator is suitable for combining values into an
   *  intermediate format before doing a proper groupReduce where the
   *  data is shuffled across the nodes for further reduction. The
   *  GroupReduce operator can also be supplied with a combiner by
   *  implementing the RichGroupReduce function. The combine method of
   *  the RichGroupReduce function requires the input and output types to be
   *  the same. The GroupCombineFunction, on the other hand, can have an
   *  arbitrary output type.
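   *
   *  For example (an illustrative sketch, not from the original scaladoc; `words` is a
   *  hypothetical input DataSet):
   *  {{{
   *    val words: DataSet[String] = ...
   *    // pre-aggregate per partition before a final reduceGroup
   *    val partialCounts = words.combineGroup { (in, out: Collector[Long]) =>
   *      out.collect(in.size.toLong)
   *    }
   *  }}}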
   */
  def combineGroup[R: TypeInformation: ClassTag](
      fun: (Iterator[T], Collector[R]) => Unit): DataSet[R] = {
    if (fun == null) {
      throw new NullPointerException("Combine function must not be null.")
    }
    val combiner = new GroupCombineFunction[T, R] {
      val cleanFun = clean(fun)
      def combine(in: java.lang.Iterable[T], out: Collector[R]) {
        cleanFun(in.iterator().asScala, out)
      }
    }
    wrap(new GroupCombineOperator[T, R](javaSet,
      implicitly[TypeInformation[R]],
      combiner,
      getCallLocationName()))
  }

  /**
   * Creates a new DataSet containing the first `n` elements of this DataSet.
   */
  def first(n: Int): DataSet[T] = {
    if (n < 1) {
      throw new InvalidProgramException("Parameter n of first(n) must be at least 1.")
    }
    // Normally reduceGroup expects implicit parameters, supply them manually here.
    reduceGroup(new FirstReducer[T](n))(javaSet.getType, implicitly[ClassTag[T]])
  }

  // --------------------------------------------------------------------------------------------
  //  distinct
  // --------------------------------------------------------------------------------------------
  /**
   * Creates a new DataSet containing the distinct elements of this DataSet. The decision whether
   * two elements are distinct or not is made using the return value of the given function.
   *
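   * For example (an illustrative sketch, not from the original scaladoc; `people` is a
   * hypothetical case-class DataSet):
   * {{{
   *   case class Person(name: String, age: Int)
   *   val people: DataSet[Person] = ...
   *   // one element per distinct name
   *   val uniqueNames = people.distinct { _.name }
   * }}}
   *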
   * @param fun The function which extracts the key values from the DataSet on which the
   *            distinction of the DataSet is decided.
   */
  def distinct[K: TypeInformation](fun: T => K): DataSet[T] = {
    val keyExtractor = new KeySelector[T, K] {
      val cleanFun = clean(fun)
      def getKey(in: T) = cleanFun(in)
    }
    wrap(new DistinctOperator[T](
      javaSet,
      new Keys.SelectorFunctionKeys[T, K](
        keyExtractor, javaSet.getType, implicitly[TypeInformation[K]]),
        getCallLocationName()))
  }

  /**
   * Returns a distinct set of this DataSet.
   *
   * If the input is a composite type (Tuple or Pojo type), distinct is performed on all fields
   * and each field must be a key type.
   */
  def distinct(): DataSet[T] = {
    wrap(new DistinctOperator[T](javaSet, null, getCallLocationName()))
  }

  /**
   * Returns a distinct set of a tuple DataSet using field position keys.
   *
   * The field position keys specify the fields of Tuples on which the decision is made if
   * two Tuples are distinct or not.
   *
   * Note: Field position keys can only be specified for Tuple DataSets.
   *
   * @param fields One or more field positions on which the distinction of the DataSet is decided.
   */
  def distinct(fields: Int*): DataSet[T] = {
    wrap(new DistinctOperator[T](
      javaSet,
      new Keys.ExpressionKeys[T](fields.toArray, javaSet.getType, true),
      getCallLocationName()))
  }

  /**
   * Returns a distinct set of this DataSet using expression keys.
   *
   * The field position keys specify the fields of Tuples or Pojos on which the decision is made
   * if two elements are distinct or not.
   *
   * The field expression keys specify the fields of a
   * [[org.apache.flink.api.common.typeutils.CompositeType]] (e.g., Tuple or Pojo type)
   * on which the decision is made if two elements are distinct or not.
   * In case of a [[org.apache.flink.api.common.typeinfo.AtomicType]], only the
   * wildcard expression ("_") is valid.
   *
   * @param firstField First field position on which the distinction of the DataSet is decided
   * @param otherFields Zero or more field positions on which the distinction of the DataSet
   *                    is decided
   */
  def distinct(firstField: String, otherFields: String*): DataSet[T] = {
    wrap(new DistinctOperator[T](
      javaSet,
      new Keys.ExpressionKeys[T](firstField +: otherFields.toArray, javaSet.getType),
      getCallLocationName()))
  }

  // --------------------------------------------------------------------------------------------
  //  Keyed DataSet
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a [[GroupedDataSet]] which provides operations on groups of elements. Elements are
   * grouped based on the value returned by the given function.
   *
   * This will not create a new DataSet, it will just attach the key function which will be used
   * for grouping when executing a grouped operation.
   */
  def groupBy[K: TypeInformation](fun: T => K): GroupedDataSet[T] = {
    val keyType = implicitly[TypeInformation[K]]
    val keyExtractor = new KeySelector[T, K] {
      val cleanFun = clean(fun)
      def getKey(in: T) = cleanFun(in)
    }
    new GroupedDataSet[T](this,
      new Keys.SelectorFunctionKeys[T, K](keyExtractor, javaSet.getType, keyType))
  }

  /**
   * Creates a [[GroupedDataSet]] which provides operations on groups of elements. Elements are
   * grouped based on the given tuple fields.
   *
   * This will not create a new DataSet, it will just attach the tuple field positions which
   * will be used for grouping when executing a grouped operation.
   *
   * This only works on Tuple DataSets.
   */
  def groupBy(fields: Int*): GroupedDataSet[T] = {
    new GroupedDataSet[T](
      this,
      new Keys.ExpressionKeys[T](fields.toArray, javaSet.getType, false))
  }

  /**
   * Creates a [[GroupedDataSet]] which provides operations on groups of elements. Elements are
   * grouped based on the given fields.
   *
   * This will not create a new DataSet, it will just attach the field names which will be
   * used for grouping when executing a grouped operation.
   */
  def groupBy(firstField: String, otherFields: String*): GroupedDataSet[T] = {
    new GroupedDataSet[T](
      this,
      new Keys.ExpressionKeys[T](firstField +: otherFields.toArray, javaSet.getType))
  }

  //  public UnsortedGrouping groupBy(String... fields) {
  //    new UnsortedGrouping(this, new Keys.ExpressionKeys(fields, getType()));
  //  }
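
  // Editor's illustrative sketch (not from the original scaladoc): grouping by a tuple field
  // and aggregating per group, assuming a hypothetical input DataSet `wordCounts`:
  //
  //   val wordCounts: DataSet[(String, Int)] = ...
  //   val totals = wordCounts.groupBy(0).sum(1)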

  // --------------------------------------------------------------------------------------------
  //  Joining
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a new DataSet by joining `this` DataSet with the `other` DataSet. To specify the join
   * keys the `where` and `equalTo` methods must be used. For example:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val joined = left.join(right).where(0).equalTo(1)
   * }}}
   *
   * The default join result is a DataSet with 2-Tuples of the joined values. In the above example
   * that would be `((String, Int, Int), (Int, String, Int))`. A custom join function can be used
   * if more control over the result is required. This can either be given as a lambda or a
   * custom [[JoinFunction]]. For example:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val joined = left.join(right).where(0).equalTo(1) { (l, r) =>
   *     (l._1, r._2)
   *   }
   * }}}
   * A join function with a [[Collector]] can be used to implement a filter directly in the join
   * or to output more than one value. This type of join function does not return a value, instead
   * values are emitted using the collector:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val joined = left.join(right).where(0).equalTo(1) {
   *     (l, r, out: Collector[(String, Int)]) =>
   *       if (l._2 > 4) {
   *         out.collect((l._1, r._3))
   *         out.collect((l._1, r._1))
   *       } else {
   *         None
   *       }
   *   }
   * }}}
   */
  def join[O](other: DataSet[O]): UnfinishedJoinOperation[T, O] =
    new UnfinishedJoinOperation(this, other, JoinHint.OPTIMIZER_CHOOSES)

  /**
   * Special [[join]] operation for explicitly telling the system what join strategy to use. If
   * null is given as the join strategy, then the optimizer will pick the strategy.
   */
  def join[O](other: DataSet[O], strategy: JoinHint): UnfinishedJoinOperation[T, O] =
    new UnfinishedJoinOperation(this, other, strategy)

  /**
   * Special [[join]] operation for explicitly telling the system that the right side is assumed
   * to be a lot smaller than the left side of the join.
   */
  def joinWithTiny[O](other: DataSet[O]): UnfinishedJoinOperation[T, O] =
    new UnfinishedJoinOperation(this, other, JoinHint.BROADCAST_HASH_SECOND)

  /**
   * Special [[join]] operation for explicitly telling the system that the left side is assumed
   * to be a lot smaller than the right side of the join.
   */
  def joinWithHuge[O](other: DataSet[O]): UnfinishedJoinOperation[T, O] =
    new UnfinishedJoinOperation(this, other, JoinHint.BROADCAST_HASH_FIRST)

  /**
   * Creates a new DataSet by performing a full outer join of `this` DataSet
   * with the `other` DataSet, by combining two elements of two DataSets on
   * key equality.
   * Elements of both DataSets that do not have a matching element on the
   * opposing side are joined with `null` and emitted to the resulting DataSet.
   *
   * To specify the join keys the `where` and `equalTo` methods must be used. For example:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val joined = left.fullOuterJoin(right).where(0).equalTo(1)
   * }}}
   *
   * When using an outer join you are required to specify a join function. For example:
   * {{{
   *   val joined = left.fullOuterJoin(right).where(0).equalTo(1) {
   *     (left, right) =>
   *       val a = if (left == null) null else left._1
   *       val b = if (right == null) null else right._3
   *       (a, b)
   *   }
   * }}}
   */
  def fullOuterJoin[O](other: DataSet[O]): UnfinishedOuterJoinOperation[T, O] =
    new UnfinishedOuterJoinOperation(this, other, JoinHint.OPTIMIZER_CHOOSES, JoinType.FULL_OUTER)

  /**
   * Special [[fullOuterJoin]] operation for explicitly telling the system what join strategy to
   * use. If null is given as the join strategy, then the optimizer will pick the strategy.
   */
  def fullOuterJoin[O](other: DataSet[O], strategy: JoinHint): UnfinishedOuterJoinOperation[T, O] =
    strategy match {
      case JoinHint.OPTIMIZER_CHOOSES | JoinHint.REPARTITION_SORT_MERGE =>
        new UnfinishedOuterJoinOperation(this, other, strategy, JoinType.FULL_OUTER)
      case _ =>
        throw new InvalidProgramException("Invalid JoinHint for FullOuterJoin: " + strategy)
    }

  /**
   * An outer join on the left side.
   *
   * Elements of the left side (i.e. `this`) that do not have a matching element on the other
   * side are joined with `null` and emitted to the resulting DataSet.
   *
   * @param other The other DataSet with which this DataSet is joined.
   * @return An UnfinishedJoinOperation to continue with the definition of the join transformation
   * @see #fullOuterJoin
   */
  def leftOuterJoin[O](other: DataSet[O]): UnfinishedOuterJoinOperation[T, O] =
    new UnfinishedOuterJoinOperation(this, other, JoinHint.OPTIMIZER_CHOOSES, JoinType.LEFT_OUTER)

  /**
   * An outer join on the left side.
   *
   * Elements of the left side (i.e. `this`) that do not have a matching element on the other
   * side are joined with `null` and emitted to the resulting DataSet.
   *
   * @param other The other DataSet with which this DataSet is joined.
   * @param strategy The strategy that should be used to execute the join. If `null` is given,
   *                 then the optimizer will pick the join strategy.
   * @return An UnfinishedJoinOperation to continue with the definition of the join transformation
   * @see #fullOuterJoin
   */
  def leftOuterJoin[O](other: DataSet[O], strategy: JoinHint): UnfinishedOuterJoinOperation[T, O] =
    strategy match {
      case JoinHint.OPTIMIZER_CHOOSES | JoinHint.REPARTITION_SORT_MERGE |
           JoinHint.REPARTITION_HASH_SECOND | JoinHint.BROADCAST_HASH_SECOND =>
        new UnfinishedOuterJoinOperation(this, other, strategy, JoinType.LEFT_OUTER)
      case _ =>
        throw new InvalidProgramException("Invalid JoinHint for LeftOuterJoin: " + strategy)
    }

  /**
   * An outer join on the right side.
   *
   * Elements of the right side (i.e. `other`) that do not have a matching element on `this`
   * side are joined with `null` and emitted to the resulting DataSet.
   *
   * @param other The other DataSet with which this DataSet is joined.
   * @return An UnfinishedJoinOperation to continue with the definition of the join transformation
   * @see #fullOuterJoin
   */
  def rightOuterJoin[O](other: DataSet[O]): UnfinishedOuterJoinOperation[T, O] =
    new UnfinishedOuterJoinOperation(this, other, JoinHint.OPTIMIZER_CHOOSES, JoinType.RIGHT_OUTER)

  /**
   * An outer join on the right side.
   *
   * Elements of the right side (i.e. `other`) that do not have a matching element on `this`
   * side are joined with `null` and emitted to the resulting DataSet.
   *
   * @param other The other DataSet with which this DataSet is joined.
   * @param strategy The strategy that should be used to execute the join. If `null` is given,
   *                 then the optimizer will pick the join strategy.
   * @return An UnfinishedJoinOperation to continue with the definition of the join transformation
   * @see #fullOuterJoin
   */
  def rightOuterJoin[O](other: DataSet[O], strategy: JoinHint): UnfinishedOuterJoinOperation[T, O] =
    strategy match {
      case JoinHint.OPTIMIZER_CHOOSES | JoinHint.REPARTITION_SORT_MERGE |
           JoinHint.REPARTITION_HASH_FIRST | JoinHint.BROADCAST_HASH_FIRST =>
        new UnfinishedOuterJoinOperation(this, other, strategy, JoinType.RIGHT_OUTER)
      case _ =>
        throw new InvalidProgramException("Invalid JoinHint for RightOuterJoin: " + strategy)
    }

  // --------------------------------------------------------------------------------------------
  //  Co-Group
  // --------------------------------------------------------------------------------------------

  /**
   * For each key in `this` DataSet and the `other` DataSet, create a tuple containing a list
   * of elements for that key from both DataSets. To specify the join keys the `where` and
   * `isEqualTo` methods must be used. For example:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val coGrouped = left.coGroup(right).where(0).isEqualTo(1)
   * }}}
   *
   * A custom coGroup function can be used
   * if more control over the result is required. This can either be given as a lambda or a
   * custom [[CoGroupFunction]]. For example:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val coGrouped = left.coGroup(right).where(0).isEqualTo(1) { (l, r) =>
   *     // l and r are of type Iterator
   *     (l.min, r.max)
   *   }
   * }}}
   * A coGroup function with a [[Collector]] can be used to implement a filter directly in the
   * coGroup or to output more than one value. This type of coGroup function does not return a
   * value, instead values are emitted using the collector:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val coGrouped = left.coGroup(right).where(0).isEqualTo(1) {
   *     (l, r, out: Collector[(String, Int)]) =>
   *       out.collect((l.min, r.max))
   *       out.collect((l.max, r.min))
   *   }
   * }}}
   */
  def coGroup[O: ClassTag](other: DataSet[O]): UnfinishedCoGroupOperation[T, O] =
    new UnfinishedCoGroupOperation(this, other)

  // --------------------------------------------------------------------------------------------
  //  Cross
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a new DataSet by forming the cartesian product of `this` DataSet and the `other`
   * DataSet.
   *
   * The default cross result is a DataSet with 2-Tuples of the combined values. A custom cross
   * function can be used if more control over the result is required. This can either be given as
   * a lambda or a custom [[CrossFunction]]. For example:
   * {{{
   *   val left: DataSet[(String, Int, Int)] = ...
   *   val right: DataSet[(Int, String, Int)] = ...
   *   val product = left.cross(right) { (l, r) => (l._2, r._3) }
   * }}}
   */
  def cross[O](other: DataSet[O]): CrossDataSet[T, O] =
    CrossDataSet.createCrossOperator(this, other, CrossHint.OPTIMIZER_CHOOSES)

  /**
   * Special [[cross]] operation for explicitly telling the system that the right side is assumed
   * to be a lot smaller than the left side of the cartesian product.
   */
  def crossWithTiny[O](other: DataSet[O]): CrossDataSet[T, O] =
    CrossDataSet.createCrossOperator(this, other, CrossHint.SECOND_IS_SMALL)

  /**
   * Special [[cross]] operation for explicitly telling the system that the left side is assumed
   * to be a lot smaller than the right side of the cartesian product.
   */
  def crossWithHuge[O](other: DataSet[O]): CrossDataSet[T, O] =
    CrossDataSet.createCrossOperator(this, other, CrossHint.FIRST_IS_SMALL)

  // --------------------------------------------------------------------------------------------
  //  Iterations
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a new DataSet by performing bulk iterations using the given step function. The
   * iterations terminate when `maxIterations` iterations have been performed.
   *
   * For example:
   * {{{
   *   val input: DataSet[(String, Int)] = ...
   *   val iterated = input.iterate(5) { previous =>
   *     val next = previous.map { x => (x._1, x._2 + 1) }
   *     next
   *   }
   * }}}
   *
   * This example will simply increase the second field of the tuple by 5.
   */
  def iterate(maxIterations: Int)(stepFunction: (DataSet[T]) => DataSet[T]): DataSet[T] = {
    val iterativeSet = new IterativeDataSet[T](
      javaSet.getExecutionEnvironment,
      javaSet.getType,
      javaSet,
      maxIterations)

    val resultSet = stepFunction(wrap(iterativeSet))
    val result = iterativeSet.closeWith(resultSet.javaSet)
    wrap(result)
  }

  /**
   * Creates a new DataSet by performing bulk iterations using the given step function. The first
   * DataSet the step function returns is the input for the next iteration, the second DataSet is
   * the termination criterion. The iterations terminate when either the termination criterion
   * DataSet contains no elements or when `maxIterations` iterations have been performed.
   *
   * For example:
   * {{{
   *   val input: DataSet[(String, Int)] = ...
   *   val iterated = input.iterateWithTermination(5) { previous =>
   *     val next = previous.map { x => (x._1, x._2 + 1) }
   *     val term = next.filter { _._2 < 3 }
   *     (next, term)
   *   }
   * }}}
   *
   * This example will simply increase the second field of the Tuples until they are no longer
   * smaller than 3.
   */
  def iterateWithTermination(maxIterations: Int)(
      stepFunction: (DataSet[T]) => (DataSet[T], DataSet[_])): DataSet[T] = {
    val iterativeSet = new IterativeDataSet[T](
      javaSet.getExecutionEnvironment,
      javaSet.getType,
      javaSet,
      maxIterations)

    val (resultSet, terminationCriterion) = stepFunction(wrap(iterativeSet))
    val result = iterativeSet.closeWith(resultSet.javaSet, terminationCriterion.javaSet)
    wrap(result)
  }

  /**
   * Creates a new DataSet by performing delta (or workset) iterations using the given step
   * function. At the beginning `this` DataSet is the solution set and `workset` is the Workset.
   * The iteration step function gets the current solution set and workset and must output the
   * delta for the solution set and the workset for the next iteration.
   *
   * Note: The syntax of delta iterations is very likely going to change soon.
   */
  def iterateDelta[R: ClassTag](workset: DataSet[R], maxIterations: Int, keyFields: Array[Int])(
      stepFunction: (DataSet[T], DataSet[R]) => (DataSet[T], DataSet[R])) = {

    val key = new ExpressionKeys[T](keyFields, javaSet.getType, false)

    val iterativeSet = new DeltaIteration[T, R](
      javaSet.getExecutionEnvironment,
      javaSet.getType,
      javaSet,
      workset.javaSet,
      key,
      maxIterations)

    val (newSolution, newWorkset) = stepFunction(
      wrap(iterativeSet.getSolutionSet),
      wrap(iterativeSet.getWorkset))
    val result = iterativeSet.closeWith(newSolution.javaSet, newWorkset.javaSet)
    wrap(result)
  }

  /**
   * Creates a new DataSet by performing delta (or workset) iterations using the given step
   * function. At the beginning `this` DataSet is the solution set and `workset` is the Workset.
   * The iteration step function gets the current solution set and workset and must output the
   * delta for the solution set and the workset for the next iteration.
   *
   * Note: The syntax of delta iterations is very likely going to change soon.
   */
  def iterateDelta[R: ClassTag](workset: DataSet[R], maxIterations: Int, keyFields: Array[Int],
      solutionSetUnManaged: Boolean)(
      stepFunction: (DataSet[T], DataSet[R]) => (DataSet[T], DataSet[R])) = {

    val key = new ExpressionKeys[T](keyFields, javaSet.getType, false)

    val iterativeSet = new DeltaIteration[T, R](
      javaSet.getExecutionEnvironment,
      javaSet.getType,
      javaSet,
      workset.javaSet,
      key,
      maxIterations)
    iterativeSet.setSolutionSetUnManaged(solutionSetUnManaged)

    val (newSolution, newWorkset) = stepFunction(
      wrap(iterativeSet.getSolutionSet),
      wrap(iterativeSet.getWorkset))
    val result = iterativeSet.closeWith(newSolution.javaSet, newWorkset.javaSet)
    wrap(result)
  }
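
  // Editor's illustrative sketch (not from the original scaladoc): a delta iteration where both
  // the solution set and the workset are keyed on the first tuple field. The `initialSolution`
  // and `initialWorkset` DataSets below are hypothetical inputs:
  //
  //   val initialSolution: DataSet[(Long, Double)] = ...
  //   val initialWorkset: DataSet[(Long, Double)] = ...
  //   val result = initialSolution.iterateDelta(initialWorkset, 10, Array(0)) {
  //     (solution, workset) =>
  //       val delta = workset.join(solution).where(0).equalTo(0) { (w, s) =>
  //         (w._1, math.min(w._2, s._2))
  //       }
  //       (delta, delta)
  //   }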

  /**
   * Creates a new DataSet by performing delta (or workset) iterations using the given step
   * function. At the beginning `this` DataSet is the solution set and `workset` is the Workset.
   * The iteration step function gets the current solution set and workset and must output the
   * delta for the solution set and the workset for the next iteration.
   *
   * Note: The syntax of delta iterations is very likely going to change soon.
   */
  def iterateDelta[R: ClassTag](workset: DataSet[R], maxIterations: Int,
      keyFields: Array[String])(
      stepFunction: (DataSet[T], DataSet[R]) => (DataSet[T], DataSet[R])) = {

    val key = new ExpressionKeys[T](keyFields, javaSet.getType)
    val iterativeSet = new DeltaIteration[T, R](
      javaSet.getExecutionEnvironment,
      javaSet.getType,
      javaSet,
      workset.javaSet,
      key,
      maxIterations)

    val (newSolution, newWorkset) = stepFunction(
      wrap(iterativeSet.getSolutionSet),
      wrap(iterativeSet.getWorkset))
    val result = iterativeSet.closeWith(newSolution.javaSet, newWorkset.javaSet)
    wrap(result)
  }

  /**
   * Creates a new DataSet by performing delta (or workset) iterations using the given step
   * function. At the beginning `this` DataSet is the solution set and `workset` is the Workset.
   * The iteration step function gets the current solution set and workset and must output the
   * delta for the solution set and the workset for the next iteration.
   *
   * Note: The syntax of delta iterations is very likely going to change soon.
   */
  def iterateDelta[R: ClassTag](workset: DataSet[R], maxIterations: Int,
      keyFields: Array[String], solutionSetUnManaged: Boolean)(
      stepFunction: (DataSet[T], DataSet[R]) => (DataSet[T], DataSet[R])) = {

    val key = new ExpressionKeys[T](keyFields, javaSet.getType)
    val iterativeSet = new DeltaIteration[T, R](
      javaSet.getExecutionEnvironment,
      javaSet.getType,
      javaSet,
      workset.javaSet,
      key,
      maxIterations)
    iterativeSet.setSolutionSetUnManaged(solutionSetUnManaged)

    val (newSolution, newWorkset) = stepFunction(
      wrap(iterativeSet.getSolutionSet),
      wrap(iterativeSet.getWorkset))
    val result = iterativeSet.closeWith(newSolution.javaSet, newWorkset.javaSet)
    wrap(result)
  }

  // -------------------------------------------------------------------------------------------
  //  Custom Operators
  // -------------------------------------------------------------------------------------------

  // Keep it out until we have an actual use case for this.
//  /**
//   * Runs a [[CustomUnaryOperation]] on the data set. Custom operations are typically complex
//   * operators that are composed of multiple steps.
//   */
//  def runOperation[R: ClassTag](operation: CustomUnaryOperation[T, R]): DataSet[R] = {
//    require(operation != null, "The custom operator must not be null.")
//    operation.setInput(this.set)
//    wrap(operation.createResult)
//  }

  // --------------------------------------------------------------------------------------------
  //  Union
  // --------------------------------------------------------------------------------------------

  /**
   * Creates a new DataSet containing the elements from both `this` DataSet and the `other`
   * DataSet.
   */
  def union(other: DataSet[T]): DataSet[T] = wrap(new UnionOperator[T](javaSet,
    other.javaSet,
    getCallLocationName()))

  // --------------------------------------------------------------------------------------------
  //  Partitioning
  // --------------------------------------------------------------------------------------------

  /**
   * Hash-partitions a DataSet on the specified tuple field positions.
   *
   * '''Important:''' This operation shuffles the whole DataSet over the network and can take a
   * significant amount of time.
   */
  def partitionByHash(fields: Int*): DataSet[T] = {
    val op = new PartitionOperator[T](
      javaSet,
      PartitionMethod.HASH,
      new Keys.ExpressionKeys[T](fields.toArray, javaSet.getType, false),
      getCallLocationName())
    wrap(op)
  }

  /**
   * Hash-partitions a DataSet on the specified fields.
   *
   * '''Important:''' This operation shuffles the whole DataSet over the network and can take a
   * significant amount of time.
   */
  def partitionByHash(firstField: String, otherFields: String*): DataSet[T] = {
    val op = new PartitionOperator[T](
      javaSet,
      PartitionMethod.HASH,
      new Keys.ExpressionKeys[T](firstField +: otherFields.toArray, javaSet.getType),
      getCallLocationName())
    wrap(op)
  }

  /**
   * Partitions a DataSet using the specified key selector function.
   *
   * '''Important:''' This operation shuffles the whole DataSet over the network and can take a
   * significant amount of time.
   */
  def partitionByHash[K: TypeInformation](fun: T => K): DataSet[T] = {
    val keyExtractor = new KeySelector[T, K] {
      val cleanFun = clean(fun)
      def getKey(in: T) = cleanFun(in)
    }
    val op = new PartitionOperator[T](
      javaSet,
      PartitionMethod.HASH,
      new Keys.SelectorFunctionKeys[T, K](
        keyExtractor,
        javaSet.getType,
        implicitly[TypeInformation[K]]),
      getCallLocationName())
    wrap(op)
  }
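
  // Editor's illustrative sketch (not from the original scaladoc): hash-partitioning by the
  // first tuple field before a partition-wise transformation, assuming a hypothetical input
  // DataSet `events`:
  //
  //   val events: DataSet[(String, Long)] = ...
  //   val partitionSizes = events.partitionByHash(0).mapPartition { in => Seq(in.size) }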

  /**
   * Partitions a tuple DataSet on the specified key fields using a custom partitioner.
   * This method takes the key position to partition on, and a partitioner that accepts the key
   * type.
   *
   * Note: This method works only on single field keys.
   */
  def partitionCustom[K: TypeInformation](partitioner: Partitioner[K], field: Int)
    : DataSet[T] = {
    val op = new PartitionOperator[T](
      javaSet,
      new Keys.ExpressionKeys[T](Array[Int](field), javaSet.getType, false),
      partitioner,
      implicitly[TypeInformation[K]],
      getCallLocationName())
    wrap(op)
  }

  /**
   * Partitions a POJO DataSet on the specified key fields using a custom partitioner.
   * This method takes the key expression to partition on, and a partitioner that accepts the key
   * type.
   *
   * Note: This method works only on single field keys.
   */
  def partitionCustom[K: TypeInformation](partitioner: Partitioner[K], field: String)
    : DataSet[T] = {
    val op = new PartitionOperator[T](
      javaSet,
      new Keys.ExpressionKeys[T](Array[String](field), javaSet.getType),
      partitioner,
      implicitly[TypeInformation[K]],
      getCallLocationName())
    wrap(op)
  }

  /**
   * Partitions a DataSet on the key returned by the selector, using a custom partitioner.
   * This method takes the key selector to get the key to partition on, and a partitioner that
   * accepts the key type.
   *
   * Note: This method works only on single field keys, i.e. the selector cannot return tuples
   * of fields.
   */
  def partitionCustom[K: TypeInformation](partitioner: Partitioner[K], fun: T => K)
    : DataSet[T] = {
    val keyExtractor = new KeySelector[T, K] {
      val cleanFun = clean(fun)
      def getKey(in: T) = cleanFun(in)
    }

    val keyType = implicitly[TypeInformation[K]]

    val op = new PartitionOperator[T](
      javaSet,
      new Keys.SelectorFunctionKeys[T, K](
        keyExtractor,
        javaSet.getType,
        keyType),
      partitioner,
      keyType,
      getCallLocationName())
    wrap(op)
  }

  /**
   * Enforces a re-balancing of the DataSet, i.e., the DataSet is evenly distributed over all
   * parallel instances of the following task. This can help to improve performance in case of
   * heavy data skew and compute intensive operations.
   *
   * '''Important:''' This operation shuffles the whole DataSet over the network and can take a
   * significant amount of time.
   *
   * @return The rebalanced DataSet.
   */
  def rebalance(): DataSet[T] = {
    wrap(new PartitionOperator[T](javaSet, PartitionMethod.REBALANCE, getCallLocationName()))
  }

  // --------------------------------------------------------------------------------------------
  //  Partition Sorting
  // --------------------------------------------------------------------------------------------

  /**
   * Locally sorts the partitions of the DataSet on the specified field in the specified order.
   * The DataSet can be sorted on multiple fields by chaining sortPartition() calls.
   */
  def sortPartition(field: Int, order: Order): DataSet[T] = {
    new PartitionSortedDataSet[T](
      new SortPartitionOperator[T](javaSet, field, order, getCallLocationName()))
  }

  /**
   * Locally sorts the partitions of the DataSet on the specified field in the specified order.
   * The DataSet can be sorted on multiple fields by chaining sortPartition() calls.
   */
  def sortPartition(field: String, order: Order): DataSet[T] = {
    new PartitionSortedDataSet[T](
      new SortPartitionOperator[T](javaSet, field, order, getCallLocationName()))
  }

  // --------------------------------------------------------------------------------------------
  //  Result writing
  // --------------------------------------------------------------------------------------------

  /**
   * Writes `this` DataSet to the specified location. This uses [[AnyRef.toString]] on
   * each element.
   */
  def writeAsText(
      filePath: String,
      writeMode: FileSystem.WriteMode = null): DataSink[T] = {
    val tof: TextOutputFormat[T] = new TextOutputFormat[T](new Path(filePath))
    if (writeMode != null) {
      tof.setWriteMode(writeMode)
    }
    output(tof)
  }

  /**
   * Writes `this` DataSet to the specified location as a CSV file.
   *
   * This only works on Tuple DataSets. For individual tuple fields [[AnyRef.toString]] is used.
   */
  def writeAsCsv(
      filePath: String,
      rowDelimiter: String = ScalaCsvOutputFormat.DEFAULT_LINE_DELIMITER,
      fieldDelimiter: String = ScalaCsvOutputFormat.DEFAULT_FIELD_DELIMITER,
      writeMode: FileSystem.WriteMode = null): DataSink[T] = {
    require(javaSet.getType.isTupleType, "CSV output can only be used with Tuple DataSets.")
    val of = new ScalaCsvOutputFormat[Product](new Path(filePath), rowDelimiter, fieldDelimiter)
    if (writeMode != null) {
      of.setWriteMode(writeMode)
    }
    output(of.asInstanceOf[OutputFormat[T]])
  }
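
  // Editor's illustrative sketch (not from the original scaladoc): writing a tuple DataSet as
  // CSV, assuming a hypothetical `counts` DataSet and output path:
  //
  //   val counts: DataSet[(String, Int)] = ...
  //   counts.writeAsCsv("file:///tmp/counts", "\n", ",", FileSystem.WriteMode.OVERWRITE)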

  /**
   * Writes `this` DataSet to the specified location using a custom
   * [[org.apache.flink.api.common.io.FileOutputFormat]].
   */
  def write(
      outputFormat: FileOutputFormat[T],
      filePath: String,
      writeMode: FileSystem.WriteMode = null): DataSink[T] = {
    require(filePath != null, "File path must not be null.")
    require(outputFormat != null, "Output format must not be null.")

    outputFormat.setOutputFilePath(new Path(filePath))
    if (writeMode != null) {
      outputFormat.setWriteMode(writeMode)
    }
    output(outputFormat)
  }

  /**
   * Emits `this` DataSet using a custom [[org.apache.flink.api.common.io.OutputFormat]].
   */
  def output(outputFormat: OutputFormat[T]): DataSink[T] = {
    javaSet.output(outputFormat)
  }

  /**
   * Prints the elements in a DataSet to the standard output stream [[System.out]] of the
   * JVM that calls the print() method. For programs that are executed in a cluster, this
   * method needs to gather the contents of the DataSet back to the client, to print it
   * there.
   *
   * The string written for each element is defined by the [[AnyRef.toString]] method.
   *
   * This method immediately triggers the program execution, similar to the
   * [[collect()]] and [[count()]] methods.
   */
  def print(): Unit = {
    javaSet.print()
  }

  /**
   * Prints the elements in a DataSet to the standard error stream [[System.err]] of the
   * JVM that calls the print() method. For programs that are executed in a cluster, this
   * method needs to gather the contents of the DataSet back to the client, to print it
   * there.
   *
   * The string written for each element is defined by the [[AnyRef.toString]] method.
   *
   * This method immediately triggers the program execution, similar to the
   * [[collect()]] and [[count()]] methods.
   */
  def printToErr(): Unit = {
    javaSet.printToErr()
  }

  /**
   * Writes a DataSet to the standard output streams (stdout) of the TaskManagers that execute
   * the program (or more specifically, the data sink operators). On a typical cluster setup, the
   * data will appear in the TaskManagers' .out files.
   *
   * To print the data to the console or stdout stream of the client process instead, use the
   * [[print()]] method.
   *
   * For each element of the DataSet the result of [[AnyRef.toString()]] is written.
   *
   * @param prefix The string to prefix each line of the output with. This helps identifying
   *               outputs from different printing sinks.
   * @return The DataSink operator that writes the DataSet.
   */
  def printOnTaskManager(prefix: String): DataSink[T] = {
    javaSet.printOnTaskManager(prefix)
  }

  /**
   * Writes a DataSet to the standard output stream (stdout) with a sink identifier prefixed.
   * This uses [[AnyRef.toString]] on each element.
   *
   * @param sinkIdentifier The string to prefix the output with.
   *
   * @deprecated Use [[printOnTaskManager(String)]] instead.
   */
  @Deprecated
  @deprecated
  def print(sinkIdentifier: String): DataSink[T] = {
    output(new PrintingOutputFormat[T](sinkIdentifier, false))
  }

  /**
   * Writes a DataSet to the standard error stream (stderr) with a sink identifier prefixed.
   * This uses [[AnyRef.toString]] on each element.
   *
   * @param sinkIdentifier The string to prefix the output with.
   *
   * @deprecated Use [[printOnTaskManager(String)]] instead.
   */
  @Deprecated
  @deprecated
  def printToErr(sinkIdentifier: String): DataSink[T] = {
    output(new PrintingOutputFormat[T](sinkIdentifier, true))
  }
}




