// cz.seznam.euphoria.spark.SparkFlowTranslator (Maven / Gradle / Ivy artifact page header)
/**
* Copyright 2016 Seznam.cz, a.s.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cz.seznam.euphoria.spark;
import com.google.common.base.Preconditions;
import cz.seznam.euphoria.core.client.dataset.windowing.WindowedElement;
import cz.seznam.euphoria.core.client.flow.Flow;
import cz.seznam.euphoria.core.client.functional.UnaryPredicate;
import cz.seznam.euphoria.core.client.graph.DAG;
import cz.seznam.euphoria.core.client.graph.Node;
import cz.seznam.euphoria.core.client.io.DataSink;
import cz.seznam.euphoria.core.client.operator.FlatMap;
import cz.seznam.euphoria.core.client.operator.Operator;
import cz.seznam.euphoria.core.client.operator.ReduceByKey;
import cz.seznam.euphoria.core.client.operator.ReduceStateByKey;
import cz.seznam.euphoria.core.client.operator.Repartition;
import cz.seznam.euphoria.core.client.operator.Union;
import cz.seznam.euphoria.core.executor.FlowUnfolder;
import cz.seznam.euphoria.hadoop.output.DataSinkOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
/**
* Translates given {@link Flow} into Spark execution environment
*/
public class SparkFlowTranslator {
private static class Translation> {
final SparkOperatorTranslator translator;
final UnaryPredicate accept;
private Translation(
SparkOperatorTranslator translator, UnaryPredicate accept) {
this.translator = Objects.requireNonNull(translator);
this.accept = accept;
}
static > void set(
Map idx,
Class type, SparkOperatorTranslator translator)
{
set(idx, type, translator, null);
}
static > void set(
Map idx,
Class type, SparkOperatorTranslator translator, UnaryPredicate accept)
{
idx.put(type, new Translation<>(translator, accept));
}
}
/* mapping of Euphoria operators to corresponding Flink transformations */
private final Map translations = new IdentityHashMap<>();
private final JavaSparkContext sparkEnv;
public SparkFlowTranslator(JavaSparkContext sparkEnv) {
this.sparkEnv = Objects.requireNonNull(sparkEnv);
// basic operators
Translation.set(translations, FlowUnfolder.InputOperator.class, new InputTranslator());
Translation.set(translations, FlatMap.class, new FlatMapTranslator());
Translation.set(translations, Repartition.class, new RepartitionTranslator());
Translation.set(translations, ReduceStateByKey.class, new ReduceStateByKeyTranslator());
Translation.set(translations, Union.class, new UnionTranslator());
// derived operators
Translation.set(translations, ReduceByKey.class, new ReduceByKeyTranslator(),
ReduceByKeyTranslator::wantTranslate);
}
@SuppressWarnings("unchecked")
public List> translateInto(Flow flow) {
// transform flow to acyclic graph of supported operators
DAG> dag = flowToDag(flow);
SparkExecutorContext executorContext =
new SparkExecutorContext(sparkEnv, dag);
// translate each operator to proper Spark transformation
dag.traverse().map(Node::get).forEach(op -> {
Translation tx = translations.get(op.getClass());
if (tx == null) {
throw new UnsupportedOperationException(
"Operator " + op.getClass().getSimpleName() + " not supported");
}
// ~ verify the flowToDag translation
Preconditions.checkState(
tx.accept == null || Boolean.TRUE.equals(tx.accept.apply(op)));
JavaRDD> out = tx.translator.translate(op, executorContext);
// save output of current operator to context
executorContext.setOutput(op, out);
});
// process all sinks in the DAG (leaf nodes)
final List> sinks = new ArrayList<>();
dag.getLeafs()
.stream()
.map(Node::get)
.filter(op -> op.output().getOutputSink() != null)
.forEach(op -> {
final DataSink> sink = op.output().getOutputSink();
sinks.add(sink);
JavaRDD sparkOutput =
Objects.requireNonNull((JavaRDD) executorContext.getOutput(op));
// unwrap data from WindowedElement
JavaPairRDD unwrapped =
sparkOutput.mapToPair(el -> new Tuple2<>(NullWritable.get(), el.getElement()));
try {
Configuration conf = DataSinkOutputFormat.configure(
new Configuration(),
(DataSink) sink);
conf.set(JobContext.OUTPUT_FORMAT_CLASS_ATTR,
DataSinkOutputFormat.class.getName());
// FIXME blocking op
unwrapped.saveAsNewAPIHadoopDataset(conf);
} catch (IOException e) {
throw new RuntimeException();
}
});
return sinks;
}
/**
* A functor to accept operators for translation if the operator's
* type equals a specified, fixed type. An optional custom "accept"
* function can be provided to further tweak the decision whether
* a particular operator instance is to be accepted for translation
* or not.
*
* @param the fixed operator type accepted
*/
public static final class TranslateAcceptor
implements UnaryPredicate> {
final Class type;
final UnaryPredicate accept;
public TranslateAcceptor(Class type) {
this (type, null);
}
public TranslateAcceptor(Class type, UnaryPredicate accept) {
this.type = Objects.requireNonNull(type);
this.accept = accept;
}
@Override
public Boolean apply(Operator, ?> operator) {
return type == operator.getClass()
&& (accept == null || accept.apply(type.cast(operator)));
}
}
/**
* Converts a {@link Flow} into a {@link DAG} of Flink specific {@link Operator}s.
*
* Invokes {@link #getAcceptors()} to determine which user provided
* operators to accept for direct translation, i.e. which to leave in
* the resulting DAG without expanding them to their {@link Operator#getBasicOps()}.
*
* @param flow the user defined flow to translate
*
* @return a DAG representing the specified flow; never {@code null}
*
* @throws IllegalStateException if validation of the specified flow failed
* for some reason
*/
protected DAG> flowToDag(Flow flow) {
// ~ get acceptors for translation
Map> acceptors =
buildAcceptorsIndex(getAcceptors());
// ~ now, unfold the flow based on the specified acceptors
return FlowUnfolder.unfold(flow, operator -> {
// accept the operator if any of the specified acceptors says so
Collection accs = acceptors.get(operator.getClass());
if (accs != null && !accs.isEmpty()) {
for (TranslateAcceptor acc : accs) {
if (acc.apply(operator)) {
return true;
}
}
}
return false;
});
}
/**
* Helper method to build an index over the given acceptors by
* {@link TranslateAcceptor#type}.
*/
private Map>
buildAcceptorsIndex(Collection accs) {
IdentityHashMap> idx =
new IdentityHashMap<>(accs.size());
for (TranslateAcceptor> acc : accs) {
Collection cac =
idx.computeIfAbsent(acc.type, k -> new ArrayList<>());
cac.add(acc);
}
return idx;
}
protected Collection getAcceptors() {
return translations.entrySet().stream()
.map(e -> new TranslateAcceptor(e.getKey(), e.getValue().accept))
.collect(Collectors.toList());
}
}