All Downloads are FREE. Search and download functionalities are using the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.emmalanguage.api.flink.FlinkOps.scala Maven / Gradle / Ivy
/*
* Copyright © 2014 TU Berlin ([email protected] )
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.emmalanguage
package api.flink
import api._
import api.alg._
import api.backend.ComprehensionCombinators
import api.backend.Runtime
import org.apache.flink.api.java.io.TypeSerializerInputFormat
import org.apache.flink.api.java.io.TypeSerializerOutputFormat
import org.apache.flink.api.scala.DataSet
import org.apache.flink.api.scala.{ExecutionEnvironment => FlinkEnv}
import org.apache.flink.core.fs.FileSystem
import java.net.URI
/** Flink backend operators. */
object FlinkOps extends ComprehensionCombinators[FlinkEnv] with Runtime[FlinkEnv] {
import FlinkDataSet.typeInfoForType
import FlinkDataSet.wrap
import Meta.Projections._
// ---------------------------------------------------------------------------
// ComprehensionCombinators
// ---------------------------------------------------------------------------
def cross[A: Meta, B: Meta](
xs: DataBag[A], ys: DataBag[B]
)(implicit flink: FlinkEnv): DataBag[(A, B)] = {
val datasetOf = new DataSetExtractor(flink)
(xs, ys) match {
case (datasetOf(xsDS), datasetOf(ysDS)) => xsDS cross ysDS
}
}
def equiJoin[A: Meta, B: Meta, K: Meta](
keyx: A => K, keyy: B => K)(xs: DataBag[A], ys: DataBag[B]
)(implicit flink: FlinkEnv): DataBag[(A, B)] = {
val datasetOf = new DataSetExtractor(flink)
(xs, ys) match {
case (datasetOf(xsDS), datasetOf(ysDS)) =>
(xsDS join ysDS) where keyx equalTo keyy
}
}
private class DataSetExtractor(flink: FlinkEnv) {
def unapply[A: Meta](bag: DataBag[A]): Option[DataSet[A]] = bag match {
case bag: FlinkDataSet[A] => Some(bag.rep)
case _ => Some(flink.fromCollection(bag.collect()))
}
}
// ---------------------------------------------------------------------------
// Runtime
// ---------------------------------------------------------------------------
def cache[A: Meta](xs: DataBag[A])(implicit flink: FlinkEnv): DataBag[A] =
xs match {
case xs: FlinkDataSet[A] =>
val sinkName = sink(xs.rep)
xs.env.execute(s"emma-cache-$sinkName")
source[A](sinkName)
case _ => xs
}
def foldGroup[A: Meta, B: Meta, K: Meta](
xs: DataBag[A], key: A => K, alg: Alg[A, B]
)(implicit flink: FlinkEnv): DataBag[Group[K, B]] = xs match {
case xs: FlinkDataSet[A] => xs.rep
.map(x => Group(key(x), alg.init(x))).groupBy("key")
.reduce((x, y) => Group(x.key, alg.plus(x.values, y.values)))
}
private def sink[A: Meta](xs: DataSet[A])(implicit flink: FlinkEnv): String = {
val typeInfo = typeInfoForType[A]
val tempName = tempNames.next()
val outFmt = new TypeSerializerOutputFormat[A]
outFmt.setInputType(typeInfo, flink.getConfig)
outFmt.setSerializer(typeInfo.createSerializer(flink.getConfig))
xs.write(outFmt, tempPath(tempName), FileSystem.WriteMode.OVERWRITE)
tempName
}
private def source[A: Meta](fileName: String)(implicit flink: FlinkEnv): DataSet[A] = {
val filePath = tempPath(fileName)
val typeInfo = typeInfoForType[A]
val inFmt = new TypeSerializerInputFormat[A](typeInfo)
inFmt.setFilePath(filePath)
flink.readFile(inFmt, filePath)
}
private val tempBase =
new URI(System.getProperty("emma.flink.temp-base", "file:///tmp/emma/flink-temp/"))
private[emmalanguage] val tempNames = Stream.iterate(0)(_ + 1)
.map(i => f"dataflow$i%03d")
.toIterator
private[emmalanguage] def tempPath(tempName: String): String =
tempBase.resolve(tempName).toURL.toString
}