All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.api.scala.utils.package.scala Maven / Gradle / Ivy

There is a newer version: 0.10.2-hadoop1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.scala

import org.apache.flink.api.common.typeinfo.{BasicTypeInfo, TypeInformation}
import org.apache.flink.api.java.Utils
import org.apache.flink.api.java.utils.{DataSetUtils => jutils}

import _root_.scala.language.implicitConversions
import _root_.scala.reflect.ClassTag

package object utils {

  /**
   * This class provides simple utility methods for zipping elements in a data set with an index
   * or with a unique identifier, sampling elements from a data set.
   *
   * @param self Data Set
   */
  implicit class DataSetUtils[T: TypeInformation : ClassTag](val self: DataSet[T]) {

    /**
     * Method that takes a set of subtask index, total number of elements mappings
     * and assigns ids to all the elements from the input data set.
     *
     * @return a data set of tuple 2 consisting of consecutive ids and initial values.
     */
    def zipWithIndex: DataSet[(Long, T)] = {
      implicit val typeInfo = createTuple2TypeInformation[Long, T](
        BasicTypeInfo.LONG_TYPE_INFO.asInstanceOf[TypeInformation[Long]],
        implicitly[TypeInformation[T]]
      )
      wrap(jutils.zipWithIndex(self.javaSet)).map { t => (t.f0.toLong, t.f1) }
    }

    /**
     * Method that assigns a unique id to all the elements of the input data set.
     *
     * @return a data set of tuple 2 consisting of ids and initial values.
     */
    def zipWithUniqueId: DataSet[(Long, T)] = {
      implicit val typeInfo = createTuple2TypeInformation[Long, T](
        BasicTypeInfo.LONG_TYPE_INFO.asInstanceOf[TypeInformation[Long]],
        implicitly[TypeInformation[T]]
      )
      wrap(jutils.zipWithUniqueId(self.javaSet)).map { t => (t.f0.toLong, t.f1) }
    }

    // --------------------------------------------------------------------------------------------
    //  Sample
    // --------------------------------------------------------------------------------------------
    /**
     * Generate a sample of DataSet by the probability fraction of each element.
     *
     * @param withReplacement Whether element can be selected more than once.
     * @param fraction        Probability that each element is chosen, should be [0,1] without
     *                        replacement, and [0, ∞) with replacement. While fraction is larger
     *                        than 1, the elements are expected to be selected multi times into
     *                        sample on average.
     * @param seed            Random number generator seed.
     * @return The sampled DataSet
     */
    def sample(
        withReplacement: Boolean,
        fraction: Double,
        seed: Long = Utils.RNG.nextLong())
      : DataSet[T] = {
      wrap(jutils.sample(self.javaSet, withReplacement, fraction, seed))
    }

    /**
     * Generate a sample of DataSet with fixed sample size.
     * 

* NOTE: Sample with fixed size is not as efficient as sample with fraction, * use sample with fraction unless you need exact precision. *

* * @param withReplacement Whether element can be selected more than once. * @param numSample The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ def sampleWithSize( withReplacement: Boolean, numSample: Int, seed: Long = Utils.RNG.nextLong()) : DataSet[T] = { wrap(jutils.sampleWithSize(self.javaSet, withReplacement, numSample, seed)) } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy