/*
* Copyright 2015 data Artisans GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dataartisans.flink.cascading.planner;
import cascading.flow.FlowElement;
import cascading.flow.FlowException;
import cascading.flow.FlowNode;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.FlowStepJob;
import cascading.flow.planner.Scope;
import cascading.flow.planner.graph.ElementGraph;
import cascading.flow.planner.graph.Extent;
import cascading.flow.planner.process.FlowNodeGraph;
import cascading.management.state.ClientState;
import cascading.pipe.Boundary;
import cascading.pipe.CoGroup;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Merge;
import cascading.pipe.Pipe;
import cascading.pipe.Splice;
import cascading.pipe.joiner.BufferJoin;
import cascading.pipe.joiner.InnerJoin;
import cascading.pipe.joiner.Joiner;
import cascading.pipe.joiner.LeftJoin;
import cascading.property.ConfigDef;
import cascading.tap.Tap;
import cascading.tap.hadoop.io.MultiInputFormat;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import com.dataartisans.flink.cascading.runtime.coGroup.bufferJoin.BufferJoinKeyExtractor;
import com.dataartisans.flink.cascading.runtime.coGroup.bufferJoin.CoGroupBufferReducer;
import com.dataartisans.flink.cascading.runtime.coGroup.regularJoin.CoGroupReducer;
import com.dataartisans.flink.cascading.runtime.coGroup.regularJoin.TupleAppendOuterJoiner;
import com.dataartisans.flink.cascading.runtime.coGroup.regularJoin.TupleOuterJoiner;
import com.dataartisans.flink.cascading.runtime.groupBy.GroupByReducer;
import com.dataartisans.flink.cascading.runtime.hashJoin.NaryHashJoinJoiner;
import com.dataartisans.flink.cascading.runtime.util.FlinkFlowProcess;
import com.dataartisans.flink.cascading.runtime.hashJoin.BinaryHashJoinJoiner;
import com.dataartisans.flink.cascading.runtime.hashJoin.JoinPrepareMapper;
import com.dataartisans.flink.cascading.runtime.hashJoin.TupleAppendCrosser;
import com.dataartisans.flink.cascading.runtime.hashJoin.TupleAppendJoiner;
import com.dataartisans.flink.cascading.runtime.hashJoin.HashJoinMapper;
import com.dataartisans.flink.cascading.runtime.each.EachMapper;
import com.dataartisans.flink.cascading.runtime.sink.TapOutputFormat;
import com.dataartisans.flink.cascading.runtime.source.TapInputFormat;
import com.dataartisans.flink.cascading.runtime.util.IdMapper;
import com.dataartisans.flink.cascading.types.tuple.TupleTypeInfo;
import com.dataartisans.flink.cascading.types.tuplearray.TupleArrayTypeInfo;
import com.dataartisans.flink.cascading.util.FlinkConfigConverter;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.GroupReduceOperator;
import org.apache.flink.api.java.operators.JoinOperator;
import org.apache.flink.api.java.operators.Operator;
import org.apache.flink.api.java.operators.PartitionOperator;
import org.apache.flink.api.java.operators.SortPartitionOperator;
import org.apache.flink.api.java.operators.SortedGrouping;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class FlinkFlowStep extends BaseFlowStep<Configuration> {
private static final Logger LOG = LoggerFactory.getLogger(FlinkFlowStep.class);
private ExecutionEnvironment env;
private List<String> classPath;
public FlinkFlowStep(ExecutionEnvironment env, ElementGraph elementGraph, FlowNodeGraph flowNodeGraph, List<String> classPath) {
super(elementGraph, flowNodeGraph);
this.env = env;
this.classPath = classPath;
}
/**
* Configures the Flink program for this step
*/
public Configuration createInitializedConfig( FlowProcess<Configuration> flowProcess, Configuration parentConfig ) {
this.env.getConfig().registerKryoType(Tuple.class);
Configuration config = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf( parentConfig );
config.set( "cascading.flow.step.num", Integer.toString( getOrdinal() ) );
HadoopUtil.setIsInflow(config);
this.setConfig(config);
return config;
}
protected FlowStepJob<Configuration> createFlowStepJob( ClientState clientState, FlowProcess<Configuration> flowProcess, Configuration initializedStepConfig ) {
this.buildFlinkProgram(flowProcess);
return new FlinkFlowStepJob(clientState, this, initializedStepConfig, classPath);
}
/**
* Method clean removes any temporary files used by this FlowStep instance. It will log any IOExceptions thrown.
*
* @param config of type Configuration
*/
public void clean( Configuration config ) {
}
public ExecutionEnvironment getExecutionEnvironment() {
return this.env;
}
public Plan getFlinkPlan() {
return this.env.createProgramPlan();
}
private void printFlowStep() {
Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();
LOG.info("Step Cnt: {} ", getFlowNodeGraph().vertexSet().size());
LOG.info("Edge Cnt: {} ", getFlowNodeGraph().edgeSet().size());
LOG.info("Src Set: {} ", getFlowNodeGraph().getSourceElements());
LOG.info("Snk Set: {} ", getFlowNodeGraph().getSinkElements());
LOG.info("##############");
while(iterator.hasNext()) {
FlowNode next = iterator.next();
LOG.info("Node cnt: {} ", next.getElementGraph().vertexSet().size());
LOG.info("Edge cnt: {} ", next.getElementGraph().edgeSet().size());
LOG.info("Nodes: {} ", next.getElementGraph().vertexSet());
LOG.info("-----------");
}
}
public void buildFlinkProgram(FlowProcess flowProcess) {
printFlowStep();
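// the source / shuffle parallelism can be set via the "flink.num.sourceTasks" and
// "flink.num.shuffleTasks" properties; otherwise the environment parallelism is used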
int numMappers;
try {
numMappers = Integer.parseInt(((FlinkFlowProcess) flowProcess).getConfig().get("flink.num.sourceTasks"));
} catch (NumberFormatException e) {
numMappers = -1;
}
int numReducers;
try {
numReducers = Integer.parseInt(((FlinkFlowProcess) flowProcess).getConfig().get("flink.num.shuffleTasks"));
} catch (NumberFormatException e) {
numReducers = -1;
}
numMappers = (numMappers > 0) ? numMappers : env.getParallelism();
numReducers = (numReducers > 0) ? numReducers : env.getParallelism();
FlowNodeGraph flowNodeGraph = getFlowNodeGraph();
Iterator<FlowNode> iterator = flowNodeGraph.getTopologicalIterator();
Map<FlowElement, DataSet<?>> flinkMemo = new HashMap<>();
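// translate the flow nodes in topological order; translated DataSets are memoized by the
// boundary (or splice) element that connects them to downstream nodes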
while(iterator.hasNext()) {
FlowNode node = iterator.next();
Set<FlowElement> all = node.getElementGraph().vertexSet();
Set<FlowElement> sources = getSources(node);
Set<FlowElement> sinks = getSinks(node);
Set<FlowElement> inner = getInnerElements(node);
// SOURCE
if (sources.size() == 1 &&
allOfType(sources, Tap.class) &&
sinks.size() == 1 &&
allOfType(sinks, Boundary.class)) {
DataSet<Tuple> sourceFlow = translateSource(flowProcess, env, node, numMappers);
for(FlowElement sink : sinks) {
flinkMemo.put(sink, sourceFlow);
}
}
// SINK
else if (sources.size() == 1 &&
allOfType(sources, Boundary.class) &&
sinks.size() == 1 &&
allOfType(sinks, Tap.class)) {
DataSet<Tuple> input = (DataSet<Tuple>) flinkMemo.get(getSingle(sources));
translateSink(flowProcess, input, node);
}
// SPLIT or EMPTY NODE (Single boundary source, one or more boundary sinks & no intermediate nodes)
else if (sources.size() == 1 &&
allOfType(sources, Boundary.class) &&
sinks.size() >= 1 &&
allOfType(sinks, Boundary.class) &&
inner.size() == 0 ) {
// just forward
for(FlowElement sink : sinks) {
flinkMemo.put(sink, flinkMemo.get(getSingle(sources)));
}
}
// INPUT OF GROUPBY (one or more boundary sources, single groupBy sink, no inner)
else if(sources.size() > 0 &&
allOfType(sources, Boundary.class) &&
sinks.size() == 1 &&
allOfType(sinks, GroupBy.class) &&
inner.size() == 0) {
GroupBy groupBy = (GroupBy)getSingle(sinks);
List<DataSet<Tuple>> groupByInputs = new ArrayList<>(sources.size());
for(FlowElement e : sources) {
groupByInputs.add((DataSet<Tuple>)flinkMemo.get(e));
}
// prepare groupBy input
DataSet<Tuple> groupByInput = prepareGroupByInput(groupByInputs, node);
flinkMemo.put(groupBy, groupByInput);
}
// GROUPBY (Single groupBy source)
else if (sources.size() == 1 &&
allOfType(sources, GroupBy.class)) {
DataSet<Tuple> input = (DataSet<Tuple>)flinkMemo.get(getSingle(sources));
DataSet<Tuple> grouped = translateGroupBy(input, node, numReducers);
for(FlowElement sink : sinks) {
flinkMemo.put(sink, grouped);
}
}
// INPUT OF COGROUP (one or more boundary sources, single coGroup sink, no inner)
else if(sources.size() > 0 &&
allOfType(sources, Boundary.class) &&
sinks.size() == 1 &&
allOfType(sinks, CoGroup.class) &&
inner.size() == 0) {
CoGroup coGroup = (CoGroup)getSingle(sinks);
List<DataSet<Tuple>> coGroupInputs = new ArrayList<>(sources.size());
for(FlowElement e : getNodeInputsInOrder(node, coGroup)) {
coGroupInputs.add((DataSet<Tuple>)flinkMemo.get(e));
}
// prepare coGroup input
DataSet<?> input = prepareCoGroupInput(coGroupInputs, node, numReducers);
flinkMemo.put(coGroup, input);
}
// COGROUP (Single CoGroup source)
else if (sources.size() == 1 &&
allOfType(sources, CoGroup.class)) {
CoGroup coGroup = (CoGroup)getSingle(sources);
DataSet<?> input = flinkMemo.get(coGroup);
DataSet<Tuple> coGrouped = translateCoGroup(input, node, numReducers);
for(FlowElement sink : sinks) {
flinkMemo.put(sink, coGrouped);
}
}
// HASHJOIN (one or more boundary source, followed by a single HashJoin)
else if(sources.size() > 0 &&
allOfType(sources, Boundary.class) &&
getCommonSuccessor(sources, node) instanceof HashJoin) {
HashJoin hashJoin = (HashJoin)getCommonSuccessor(sources, node);
List<DataSet<Tuple>> hashJoinInputs = new ArrayList<>(sources.size());
for(FlowElement e : getNodeInputsInOrder(node, hashJoin)) {
hashJoinInputs.add((DataSet<Tuple>)flinkMemo.get(e));
}
DataSet<Tuple> joined = translateHashJoin(hashJoinInputs, node);
for(FlowElement sink : sinks) {
flinkMemo.put(sink, joined);
}
}
// MERGE (multiple boundary sources, single boundary sink, single merge inner)
else if (sources.size() > 1 &&
allOfType(sources, Boundary.class) &&
sinks.size() == 1 &&
allOfType(sinks, Boundary.class) &&
inner.size() == 1 &&
allOfType(inner, Merge.class)) {
List<DataSet<Tuple>> mergeInputs = new ArrayList<>(sources.size());
for(FlowElement e : sources) {
mergeInputs.add((DataSet<Tuple>)flinkMemo.get(e));
}
DataSet<Tuple> unioned = translateMerge(mergeInputs, node);
for(FlowElement sink : sinks) {
flinkMemo.put(sink, unioned);
}
}
// MAP (Single boundary source AND nothing else matches)
else if (sources.size() == 1 &&
allOfType(sources, Boundary.class)) {
DataSet<Tuple> input = (DataSet<Tuple>)flinkMemo.get(getSingle(sources));
DataSet<Tuple> mapped = translateMap(input, node);
for(FlowElement sink : sinks) {
flinkMemo.put(sink, mapped);
}
}
else {
throw new RuntimeException("Could not translate this node: "+node.getElementGraph().vertexSet());
}
}
}
private DataSet<Tuple> translateSource(FlowProcess flowProcess, ExecutionEnvironment env, FlowNode node, int dop) {
Tap tap = this.getSingle(node.getSourceTaps());
JobConf tapConfig = new JobConf(this.getNodeConfig(node));
tap.sourceConfInit(flowProcess, tapConfig);
tapConfig.set( "cascading.step.source", Tap.id( tap ) );
Fields outFields = tap.getSourceFields();
registerKryoTypes(outFields);
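// nest the tap-specific config into the source config so that the TapInputFormat can restore it at runtime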
JobConf sourceConfig = new JobConf(this.getNodeConfig(node));
MultiInputFormat.addInputFormat(sourceConfig, tapConfig);
DataSet<Tuple> src = env
.createInput(new TapInputFormat(node), new TupleTypeInfo(outFields))
.name(tap.getIdentifier())
.setParallelism(dop)
.withParameters(FlinkConfigConverter.toFlinkConfig(new Configuration(sourceConfig)));
return src;
}
private void translateSink(FlowProcess flowProcess, DataSet<Tuple> input, FlowNode node) {
Tap tap = this.getSingle(node.getSinkTaps());
Configuration sinkConfig = this.getNodeConfig(node);
tap.sinkConfInit(flowProcess, sinkConfig);
int desiredDop = tap.getScheme().getNumSinkParts();
int inputDop = ((Operator)input).getParallelism();
int dop;
if (inputDop == 1) {
// input operators have dop 1. Probably because they perform a non-keyed reduce or coGroup
dop = 1;
}
else {
if (desiredDop > 0) {
// output dop explicitly set.
if (input instanceof GroupReduceOperator) {
// input is a reduce and we must preserve its sorting.
// we must set the desired dop also for reduce and related operators
adjustDopOfReduceOrCoGroup((GroupReduceOperator) input, desiredDop);
}
dop = desiredDop;
}
else {
dop = inputDop;
}
}
input
.output(new TapOutputFormat(node))
.name(tap.getIdentifier())
.setParallelism(dop)
.withParameters(FlinkConfigConverter.toFlinkConfig(sinkConfig));
}
/**
* Adjusts the parallelism of a GroupReduce operator (and all associated operators) that
* belongs to a Cascading GroupBy or CoGroup pipe.
* This needs to be done if the result must be emitted in order and a specific sink
* parallelism is requested.
*
* @param reduceOp The operator whose DOP needs to be adjusted
* @param dop The parallelism to set
*/
private void adjustDopOfReduceOrCoGroup(GroupReduceOperator reduceOp, int dop) {
reduceOp.setParallelism(dop);
DataSet reduceInput = reduceOp.getInput();
if (reduceInput instanceof SortPartitionOperator) {
// We have a Reduce operator whose grouping keys need to be reversely ordered.
// This yields: input -> PartitionOperator -> SortPartitionOperator -> GroupReduceOperator.
// The DOPs of the PartitionOperator and SortPartitionOperator must be adjusted.
SortPartitionOperator sortOp = (SortPartitionOperator)reduceInput;
sortOp.setParallelism(dop);
DataSet sortInput = sortOp.getInput();
if (sortInput instanceof PartitionOperator) {
PartitionOperator partitionOp = (PartitionOperator)sortInput;
partitionOp.setParallelism(dop);
}
}
else if (reduceInput instanceof JoinOperator &&
((JoinOperator)reduceInput).getJoinHint() == JoinHint.REPARTITION_SORT_MERGE) {
// We have a CoGroup operator whose input is processed by one or more sort-merge outer joins.
// The DOPs of all outer joins must be adjusted.
JoinOperator joinOp = (JoinOperator)reduceInput;
while (joinOp != null && joinOp.getJoinHint() == JoinHint.REPARTITION_SORT_MERGE) {
joinOp.setParallelism(dop);
DataSet leftJoinInput = joinOp.getInput1();
if (leftJoinInput instanceof JoinOperator) {
joinOp = (JoinOperator)leftJoinInput;
}
else {
joinOp = null;
}
}
}
}
private DataSet<Tuple> translateMap(DataSet<Tuple> input, FlowNode node) {
Fields outFields = getOutScope(node).getOutValuesFields();
registerKryoTypes(outFields);
int dop = ((Operator)input).getParallelism();
return input
.mapPartition(new EachMapper(node))
.returns(new TupleTypeInfo(outFields))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(dop)
.name("map-" + node.getID());
}
private DataSet<Tuple> prepareGroupByInput(List<DataSet<Tuple>> inputs, FlowNode node) {
DataSet<Tuple> merged = null;
for(int i=0; i<inputs.size(); i++) {
DataSet<Tuple> input = inputs.get(i);
if(merged == null) {
merged = input;
}
else {
merged = merged
.union(input);
}
}
return merged;
}
private DataSet<Tuple> translateGroupBy(DataSet<Tuple> input, FlowNode node, int dop) {
GroupBy groupBy = (GroupBy) node.getSourceElements().iterator().next();
Scope outScope = getOutScope(node);
List<Scope> inScopes = getInputScopes(node, groupBy);
Fields outFields;
if(outScope.isEvery()) {
outFields = outScope.getOutGroupingFields();
}
else {
outFields = outScope.getOutValuesFields();
}
registerKryoTypes(outFields);
// get input scope
Scope inScope = inScopes.get(0);
// get grouping keys
Fields groupKeyFields = groupBy.getKeySelectors().get(inScope.getName());
// get group sorting keys
Fields sortKeyFields = groupBy.getSortingSelectors().get(inScope.getName());
String[] groupKeys = registerKeyFields(input, groupKeyFields);
String[] sortKeys = null;
if (sortKeyFields != null) {
sortKeys = registerKeyFields(input, sortKeyFields);
}
Order sortOrder = groupBy.isSortReversed() ? Order.DESCENDING : Order.ASCENDING;
if(sortOrder == Order.DESCENDING) {
// translate groupBy with inverse sort order
return translateInverseSortedGroupBy(input, node, dop, groupKeys, sortKeys, outFields);
}
else if(groupKeys == null || groupKeys.length == 0) {
// translate key-less (global) groupBy
return translateGlobalGroupBy(input, node, dop, sortKeys, sortOrder, outFields);
}
else {
UnsortedGrouping<Tuple> grouping = input
.groupBy(groupKeys);
if(sortKeys != null && sortKeys.length > 0) {
// translate groupBy with group sorting
SortedGrouping<Tuple> sortedGrouping = grouping
.sortGroup(sortKeys[0], Order.ASCENDING);
for(int i=1; i<sortKeys.length; i++) {
sortedGrouping = sortedGrouping
.sortGroup(sortKeys[i], Order.ASCENDING);
}
return sortedGrouping
.reduceGroup(new GroupByReducer(node))
.returns(new TupleTypeInfo(outFields))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(dop)
.name("reduce-" + node.getID());
}
else {
// translate groupBy without group sorting
return grouping
.reduceGroup(new GroupByReducer(node))
.returns(new TupleTypeInfo(outFields))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(dop)
.name("reduce-" + node.getID());
}
}
}
private DataSet<Tuple> translateGlobalGroupBy(DataSet<Tuple> input, FlowNode node, int dop,
String[] sortKeys, Order sortOrder, Fields outFields) {
DataSet<Tuple> result = input;
// sort on sorting keys if necessary
if(sortKeys != null && sortKeys.length > 0) {
result = result
.sortPartition(sortKeys[0], sortOrder)
.setParallelism(1)
.name("reduce-"+ node.getID());
for(int i=1; i<sortKeys.length; i++) {
result = result
.sortPartition(sortKeys[i], sortOrder)
.setParallelism(1)
.name("reduce-" + node.getID());
}
}
// reduce all tuples with the GroupBy reducer in a single partition
return result
.reduceGroup(new GroupByReducer(node))
.returns(new TupleTypeInfo(outFields))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(1)
.name("reduce-" + node.getID());
}
private DataSet<Tuple> translateInverseSortedGroupBy(DataSet<Tuple> input, FlowNode node, int dop,
String[] groupKeys, String[] sortKeys, Fields outFields) {
DataSet<Tuple> result = input;
// hash partition and sort on grouping keys if necessary
if(groupKeys != null && groupKeys.length > 0) {
// hash partition
result = result
.partitionByHash(groupKeys)
.setParallelism(dop)
.name("reduce-" + node.getID());
// sort on grouping keys
result = result
.sortPartition(groupKeys[0], Order.DESCENDING)
.setParallelism(dop)
.name("reduce-" + node.getID());
for(int i=1; i<groupKeys.length; i++) {
result = result
.sortPartition(groupKeys[i], Order.DESCENDING)
.setParallelism(dop)
.name("reduce-" + node.getID());
}
}
// sort on sorting keys if necessary
if(sortKeys != null && sortKeys.length > 0) {
result = result
.sortPartition(sortKeys[0], Order.DESCENDING)
.setParallelism(dop)
.name("reduce-" + node.getID());
for(int i=1; i<sortKeys.length; i++) {
result = result
.sortPartition(sortKeys[i], Order.DESCENDING)
.setParallelism(dop)
.name("reduce-" + node.getID());
}
}
// group on the grouping keys and reduce with the GroupBy reducer;
// the sorted partitions preserve the inverse key order
return result
.groupBy(groupKeys)
.reduceGroup(new GroupByReducer(node))
.returns(new TupleTypeInfo(outFields))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(dop)
.name("reduce-" + node.getID());
}
private DataSet<Tuple> translateMerge(List<DataSet<Tuple>> inputs, FlowNode node) {
DataSet<Tuple> unioned = null;
TypeInformation<Tuple> type = null;
int maxDop = -1;
for(DataSet<Tuple> input : inputs) {
maxDop = Math.max(maxDop, ((Operator)input).getParallelism());
if(unioned == null) {
unioned = input;
type = input.getType();
}
else {
unioned = unioned.union(input);
}
}
return unioned.map(new IdMapper())
.returns(type)
.setParallelism(maxDop);
}
private DataSet<?> prepareCoGroupInput(List<DataSet<Tuple>> inputs, FlowNode node, int dop) {
CoGroup coGroup = (CoGroup)getSingle(node.getSinkElements());
Joiner joiner = coGroup.getJoiner();
int numJoinInputs = coGroup.isSelfJoin() ? coGroup.getNumSelfJoins() + 1 : inputs.size();
Fields[] inputFields = new Fields[numJoinInputs];
Fields[] keyFields = new Fields[numJoinInputs];
String[][] flinkKeys = new String[numJoinInputs][];
List<DataSet<Tuple>> joinInputs = computeSpliceInputsFieldsKeys(coGroup, node, inputs, inputFields, keyFields, flinkKeys);
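// pick the input preparation strategy: keyed joins are assembled by a chain of full outer joins,
// key-less InnerJoins become a Cartesian product, and BufferJoins are tagged with their input ordinal and unioned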
if(joiner.getClass().equals(InnerJoin.class)) {
if(!keyFields[0].isNone()) {
return prepareFullOuterCoGroupInput(joinInputs, node, inputFields, keyFields, flinkKeys, dop);
}
else {
// Cartesian product
return prepareInnerCrossInput(joinInputs, node, inputFields, dop);
}
}
else if(joiner.getClass().equals(BufferJoin.class)) {
return prepareBufferCoGroupInput(joinInputs, node, inputFields, keyFields, flinkKeys, dop);
}
else {
return prepareFullOuterCoGroupInput(joinInputs, node, inputFields, keyFields, flinkKeys, dop);
}
}
private DataSet<Tuple2<Tuple, Tuple[]>> prepareFullOuterCoGroupInput(List<DataSet<Tuple>> inputs,
FlowNode node, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys, int dop) {
int numJoinInputs = inputs.size();
TupleTypeInfo keysTypeInfo = inputFields[0].isDefined() ?
new TupleTypeInfo(inputFields[0].select(keyFields[0])) :
new TupleTypeInfo(Fields.UNKNOWN);
keysTypeInfo.registerKeyFields(keyFields[0]);
TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, 2))
);
String[] listKeys = new String[flinkKeys[0].length];
String[] listKeysFwd = new String[flinkKeys[0].length];
for(int i=0; i<flinkKeys[0].length; i++) {
listKeys[i] = "f0." + i;
listKeysFwd[i] = flinkKeys[0][i] + " -> " + listKeys[i];
}
// first outer join with CoGroup
DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
.fullOuterJoin(inputs.get(1), JoinHint.REPARTITION_SORT_MERGE)
.where(flinkKeys[0]).equalTo(flinkKeys[1])
.with(new TupleOuterJoiner(numJoinInputs,
inputFields[0], keyFields[0],
inputFields[1], keyFields[1]))
.returns(tupleJoinListsTypeInfo)
.withForwardedFieldsFirst(listKeysFwd)
.setParallelism(dop)
.name("coGroup-" + node.getID());
// further outer joins with CoGroup
for (int i = 2; i < inputs.size(); i++) {
tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, i+1))
);
tupleJoinLists = tupleJoinLists
.fullOuterJoin(inputs.get(i), JoinHint.REPARTITION_SORT_MERGE)
.where(listKeys).equalTo(flinkKeys[i])
.with(new TupleAppendOuterJoiner(i, numJoinInputs, inputFields[i], keyFields[i]))
.returns(tupleJoinListsTypeInfo)
.withForwardedFieldsFirst(listKeys)
.setParallelism(dop)
.name("coGroup-" + node.getID());
}
return tupleJoinLists;
}
private DataSet<Tuple2<Tuple, Tuple[]>> prepareInnerCrossInput(List<DataSet<Tuple>> inputs, FlowNode node, Fields[] inputFields, int dop) {
int numJoinInputs = inputs.size();
TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
new TupleTypeInfo(Fields.UNKNOWN),
new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, 1))
);
int mapDop = ((Operator)inputs.get(0)).getParallelism();
// prepare tuple list for join
DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
.map(new JoinPrepareMapper(numJoinInputs, null, null))
.returns(tupleJoinListsTypeInfo)
.setParallelism(mapDop)
.name("coGroup-" + node.getID());
for (int i = 1; i < inputs.size(); i++) {
tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
new TupleTypeInfo(Fields.UNKNOWN),
new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, i+1))
);
tupleJoinLists = tupleJoinLists.crossWithTiny(inputs.get(i))
.with(new TupleAppendCrosser(i))
.returns(tupleJoinListsTypeInfo)
.setParallelism(dop)
.name("coGroup-" + node.getID());
}
return tupleJoinLists;
}
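// for a BufferJoin, each input tuple is mapped to (join key, input ordinal, tuple) and all inputs
// are unioned so that the CoGroupBufferReducer can process them as a single grouped data set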
private DataSet<Tuple3<Tuple, Integer, Tuple>> prepareBufferCoGroupInput(List<DataSet<Tuple>> inputs,
FlowNode node, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys, int dop) {
DataSet<Tuple3<Tuple, Integer, Tuple>> coGroupInput = null;
for(int i=0; i<inputs.size(); i++) {
DataSet<Tuple> input = inputs.get(i);
// get keys
int[] keyPos = inputFields[i].getPos(keyFields[i]);
if(keyFields[i].isNone()) {
// set default key
keyFields[i] = new Fields("defaultKey");
}
TupleTypeInfo keysTypeInfo = inputFields[i].isDefined() ?
new TupleTypeInfo(inputFields[i].select(keyFields[i])) :
new TupleTypeInfo(Fields.UNKNOWN);
TypeInformation<Tuple3<Tuple, Integer, Tuple>> keyedType =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
BasicTypeInfo.INT_TYPE_INFO,
new TupleTypeInfo(inputFields[i])
);
int inputDop = ((Operator)input).getParallelism();
// add mapper
DataSet<Tuple3<Tuple, Integer, Tuple>> keyedInput = input
.map(new BufferJoinKeyExtractor(i, keyPos))
.returns(keyedType)
.setParallelism(inputDop)
.name("coGroup-" + node.getID());
// add to groupByInput
if(coGroupInput == null) {
coGroupInput = keyedInput;
}
else {
coGroupInput = coGroupInput
.union(keyedInput);
}
}
return coGroupInput;
}
private DataSet<Tuple> translateCoGroup(DataSet<?> input, FlowNode node, int dop) {
CoGroup coGroup = (CoGroup)getSingle(node.getSourceElements());
// get out fields of node
Scope outScope = getOutScope(node);
Fields outFields;
if(outScope.isEvery()) {
outFields = outScope.getOutGroupingFields();
}
else {
outFields = outScope.getOutValuesFields();
}
registerKryoTypes(outFields);
// get key and value fields of inputs
List<Scope> inScopes = getInputScopes(node, coGroup);
Fields keyFields = coGroup.getKeySelectors().get(inScopes.get(0).getName());
Joiner joiner = coGroup.getJoiner();
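// non-Buffer joiners are executed as a grouped reduce with the CoGroupReducer; BufferJoins group on the
// key and sort each group by the input ordinal for the CoGroupBufferReducer; key-less cases run with parallelism 1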
if(!(joiner instanceof BufferJoin)) {
if (keyFields != Fields.NONE) {
String[] groupingKeys = new String[keyFields.size()];
for (int i = 0; i < groupingKeys.length; i++) {
groupingKeys[i] = "f0." + i;
}
DataSet<Tuple> joinResult = ((DataSet<Tuple2<Tuple, Tuple[]>>) input)
.groupBy(groupingKeys)
.reduceGroup(new CoGroupReducer(node))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(dop)
.returns(new TupleTypeInfo(outFields))
.name("cogroup-" + node.getID());
return joinResult;
} else {
DataSet<Tuple> joinResult = ((DataSet<Tuple2<Tuple, Tuple[]>>) input)
.reduceGroup(new CoGroupReducer(node))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(1)
.returns(new TupleTypeInfo(outFields))
.name("cogroup-" + node.getID());
return joinResult;
}
}
else {
// Buffer Join
if (keyFields != Fields.NONE) {
return ((DataSet<Tuple3<Tuple, Integer, Tuple>>) input)
.groupBy("f0.*")
.sortGroup(1, Order.DESCENDING)
.reduceGroup(new CoGroupBufferReducer(node))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(dop)
.returns(new TupleTypeInfo(outFields))
.name("coGroup-" + node.getID());
}
else {
return ((DataSet<Tuple3<Tuple, Integer, Tuple>>) input)
.sortPartition(1, Order.DESCENDING)
.setParallelism(1)
.reduceGroup(new CoGroupBufferReducer(node))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(1)
.returns(new TupleTypeInfo(outFields))
.name("coGroup-" + node.getID());
}
}
}
private DataSet<Tuple> translateHashJoin(List<DataSet<Tuple>> inputs, FlowNode node) {
HashJoin hashJoin = (HashJoin) getCommonSuccessor(node.getSourceElements(), node);
Joiner joiner = hashJoin.getJoiner();
// check if joiner is a Scalding WrappedJoiner and
// try to extract the joiner which is wrapped inside
if (joiner.getClass().getName().equals("com.twitter.scalding.WrappedJoiner")) {
try {
Field joinerField = joiner.getClass().getDeclaredField("joiner");
joinerField.setAccessible(true);
joiner = (Joiner)joinerField.get(joiner);
}
catch(NoSuchFieldException | IllegalAccessException nsfe) {
nsfe.printStackTrace();
LOG.warn("Could not extract joiner from Scalding's WrappedJoiner. " +
"Will continue without extracting joiner.");
}
}
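// HashJoins are translated into a chain of broadcast-hash joins; the first (streamed) input determines the parallelism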
int numJoinInputs = hashJoin.isSelfJoin() ? hashJoin.getNumSelfJoins() + 1 : inputs.size();
Fields[] inputFields = new Fields[numJoinInputs];
Fields[] keyFields = new Fields[numJoinInputs];
String[][] flinkKeys = new String[numJoinInputs][];
List<DataSet<Tuple>> joinInputs = computeSpliceInputsFieldsKeys(hashJoin, node, inputs, inputFields, keyFields, flinkKeys);
if(keyFields[0].isNone()) {
// Cartesian product
return translateInnerCrossProduct(node, joinInputs);
}
else if(joiner.getClass().equals(InnerJoin.class)) {
// inner join with keys
return translateInnerHashJoin(node, joinInputs, inputFields, keyFields, flinkKeys);
}
else if (joiner.getClass().equals(LeftJoin.class)) {
return translateLeftHashJoin(node, joinInputs, inputFields, keyFields, flinkKeys);
}
else {
LOG.error("Unsupported Joiner: {}", joiner.getClass().getName());
throw new FlowException("HashJoin only supports InnerJoin and LeftJoin, but found " +
joiner.getClass().getName());
}
}
private DataSet<Tuple> translateInnerHashJoin(FlowNode node, List<DataSet<Tuple>> inputs, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys) {
int numJoinInputs = inputs.size();
// get out fields of node
Scope outScope = getOutScope(node);
Fields outFields;
if (outScope.isEvery()) {
outFields = outScope.getOutGroupingFields();
} else {
outFields = outScope.getOutValuesFields();
}
registerKryoTypes(outFields);
int probeSideDOP = ((Operator)inputs.get(0)).getParallelism();
if(numJoinInputs == 2) {
// binary join
return inputs.get(0).join(inputs.get(1), JoinHint.BROADCAST_HASH_SECOND)
.where(flinkKeys[0]).equalTo(flinkKeys[1])
.with(new BinaryHashJoinJoiner(node, inputFields[0], keyFields[0]))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(probeSideDOP)
.returns(new TupleTypeInfo(outFields))
.name("hashjoin-" + node.getID());
}
else {
// nary join
TupleTypeInfo keysTypeInfo = inputFields[0].isDefined() ?
new TupleTypeInfo(inputFields[0].select(keyFields[0])) :
new TupleTypeInfo(Fields.UNKNOWN);
keysTypeInfo.registerKeyFields(keyFields[0]);
TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
new TupleArrayTypeInfo(numJoinInputs-1, Arrays.copyOf(inputFields, 1))
);
int mapDop = ((Operator) inputs.get(0)).getParallelism();
// prepare tuple list for join
DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
.map(new JoinPrepareMapper(numJoinInputs - 1, inputFields[0], keyFields[0]))
.returns(tupleJoinListsTypeInfo)
.setParallelism(mapDop)
.name("hashjoin-" + node.getID());
for (int i = 0; i < flinkKeys[0].length; i++) {
flinkKeys[0][i] = "f0." + i;
}
// join all inputs except last
for (int i = 1; i < inputs.size()-1; i++) {
tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
new TupleArrayTypeInfo(numJoinInputs-1, Arrays.copyOf(inputFields, i+1))
);
tupleJoinLists = tupleJoinLists.join(inputs.get(i), JoinHint.BROADCAST_HASH_SECOND)
.where(flinkKeys[0]).equalTo(flinkKeys[i])
.with(new TupleAppendJoiner(i))
.returns(tupleJoinListsTypeInfo)
.withForwardedFieldsFirst(flinkKeys[0])
.setParallelism(probeSideDOP)
.name("hashjoin-" + node.getID());
}
// join last input
return tupleJoinLists.join(inputs.get(numJoinInputs-1), JoinHint.BROADCAST_HASH_SECOND)
.where(flinkKeys[0]).equalTo(flinkKeys[numJoinInputs-1])
.with(new NaryHashJoinJoiner(node, numJoinInputs))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(probeSideDOP)
.returns(new TupleTypeInfo(outFields))
.name("hashjoin-" + node.getID());
}
}
private DataSet<Tuple> translateLeftHashJoin(FlowNode node, List<DataSet<Tuple>> inputs, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys) {
int numJoinInputs = inputs.size();
// get out fields of node
Scope outScope = getOutScope(node);
Fields outFields;
if (outScope.isEvery()) {
outFields = outScope.getOutGroupingFields();
} else {
outFields = outScope.getOutValuesFields();
}
registerKryoTypes(outFields);
int probeSideDOP = ((Operator)inputs.get(0)).getParallelism();
if(numJoinInputs == 2) {
// binary join
return inputs.get(0)
.leftOuterJoin(inputs.get(1), JoinHint.BROADCAST_HASH_SECOND)
.where(flinkKeys[0]).equalTo(flinkKeys[1])
.with(new BinaryHashJoinJoiner(node, inputFields[0], keyFields[0]))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(probeSideDOP)
.returns(new TupleTypeInfo(outFields))
.name("hashjoin-" + node.getID());
}
else {
// nary join
TupleTypeInfo keysTypeInfo = inputFields[0].isDefined() ?
new TupleTypeInfo(inputFields[0].select(keyFields[0])) :
new TupleTypeInfo(Fields.UNKNOWN);
keysTypeInfo.registerKeyFields(keyFields[0]);
TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
new TupleArrayTypeInfo(numJoinInputs-1, Arrays.copyOf(inputFields, 1))
);
// prepare tuple list for join
DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
.map(new JoinPrepareMapper(numJoinInputs - 1, inputFields[0], keyFields[0]))
.returns(tupleJoinListsTypeInfo)
.setParallelism(probeSideDOP)
.name("hashjoin-" + node.getID());
for (int i = 0; i < flinkKeys[0].length; i++) {
flinkKeys[0][i] = "f0." + i;
}
// join all inputs except last
for (int i = 1; i < inputs.size()-1; i++) {
tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
keysTypeInfo,
new TupleArrayTypeInfo(numJoinInputs-1, Arrays.copyOf(inputFields, i+1))
);
tupleJoinLists = tupleJoinLists
.join(inputs.get(i), JoinHint.BROADCAST_HASH_SECOND)
.where(flinkKeys[0]).equalTo(flinkKeys[i])
.with(new TupleAppendJoiner(i))
.returns(tupleJoinListsTypeInfo)
.withForwardedFieldsFirst(flinkKeys[0])
.setParallelism(probeSideDOP)
.name("hashjoin-" + node.getID());
}
// join last input
return tupleJoinLists
.leftOuterJoin(inputs.get(numJoinInputs-1), JoinHint.BROADCAST_HASH_SECOND)
.where(flinkKeys[0]).equalTo(flinkKeys[numJoinInputs-1])
.with(new NaryHashJoinJoiner(node, numJoinInputs))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(probeSideDOP)
.returns(new TupleTypeInfo(outFields))
.name("hashjoin-" + node.getID());
}
}
private DataSet<Tuple> translateInnerCrossProduct(FlowNode node, List<DataSet<Tuple>> inputs) {
int numJoinInputs = inputs.size();
// get out fields of node
Scope outScope = getOutScope(node);
Fields outFields;
if (outScope.isEvery()) {
outFields = outScope.getOutGroupingFields();
} else {
outFields = outScope.getOutValuesFields();
}
registerKryoTypes(outFields);
int probeSideDOP = ((Operator)inputs.get(0)).getParallelism();
TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo =
new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
new TupleTypeInfo(Fields.UNKNOWN),
ObjectArrayTypeInfo.getInfoFor(new TupleTypeInfo(Fields.UNKNOWN))
);
// prepare tuple list for join
DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
.map(new JoinPrepareMapper(numJoinInputs, null, null))
.returns(tupleJoinListsTypeInfo)
.setParallelism(probeSideDOP)
.name("hashjoin-" + node.getID());
for (int i = 1; i < inputs.size(); i++) {
tupleJoinLists = tupleJoinLists.crossWithTiny(inputs.get(i))
.with(new TupleAppendCrosser(i))
.returns(tupleJoinListsTypeInfo)
.setParallelism(probeSideDOP)
.name("hashjoin-" + node.getID());
}
return tupleJoinLists
.mapPartition(new HashJoinMapper(node))
.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(probeSideDOP)
.returns(new TupleTypeInfo(outFields))
.name("hashjoin-" + node.getID());
}
private List<DataSet<Tuple>> computeSpliceInputsFieldsKeys(Splice splice, FlowNode node, List<DataSet<Tuple>> inputs, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys) {
int numJoinInputs = splice.isSelfJoin() ? splice.getNumSelfJoins() + 1 : inputs.size();
List<Scope> inScopes = getInputScopes(node, splice);
List<DataSet<Tuple>> inputs2;
// collect key and value fields of inputs
if(!splice.isSelfJoin()) {
// regular join with different inputs
for (int i = 0; i < numJoinInputs; i++) {
// get input scope
Scope inScope = inScopes.get(i);
// get join key fields
inputFields[i] = ((TupleTypeInfo)inputs.get(i).getType()).getSchema();
keyFields[i] = splice.getKeySelectors().get(inScope.getName());
flinkKeys[i] = registerKeyFields(inputs.get(i), keyFields[i]);
}
inputs2 = inputs;
}
else {
// self join
Scope inScope = inScopes.get(0);
// get join key fields
inputFields[0] = ((TupleTypeInfo)inputs.get(0).getType()).getSchema();
keyFields[0] = splice.getKeySelectors().get(inScope.getName());
flinkKeys[0] = registerKeyFields(inputs.get(0), keyFields[0]);
for (int i = 1; i < numJoinInputs; i++) {
inputFields[i] = inputFields[0];
keyFields[i] = keyFields[0];
flinkKeys[i] = Arrays.copyOf(flinkKeys[0], flinkKeys[0].length);
}
// duplicate self join input to treat it like a regular join
inputs2 = new ArrayList<>(numJoinInputs);
for(int i=0; i<numJoinInputs; i++) {
inputs2.add(inputs.get(0));
}
}
return inputs2;
}
private List<Scope> getInputScopes(FlowNode node, Splice splice) {
Pipe[] inputs = splice.getPrevious();
List<Scope> inScopes = new ArrayList<>(inputs.length);
for(Pipe input : inputs) {
boolean found = false;
for (Scope inScope : node.getPreviousScopes(splice)) {
if(inScope.getName().equals(input.getName())) {
inScopes.add(inScope);
found = true;
break;
}
}
if(!found) {
throw new RuntimeException("Input scope was not found");
}
}
return inScopes;
}
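// returns the source elements of the node ordered by the input position of the given Splice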
private FlowElement[] getNodeInputsInOrder(FlowNode node, Splice splice) {
Map<String, Integer> posMap = splice.getPipePos();
FlowElement[] spliceInputs = new FlowElement[posMap.size()];
ElementGraph eg = node.getElementGraph();
for(FlowElement nodeSource : getSources(node)) {
int idx = posMap.get(eg.getEdge(nodeSource, splice).getName());
spliceInputs[idx] = nodeSource;
}
return spliceInputs;
}
private Set<FlowElement> getSources(FlowNode node) {
return node.getSourceElements();
}
private Set<FlowElement> getSinks(FlowNode node) {
return node.getSinkElements();
}
private Set<FlowElement> getInnerElements(FlowNode node) {
Set<FlowElement> inner = new HashSet<>(node.getElementGraph().vertexSet());
inner.removeAll(getSources(node));
inner.removeAll(getSinks(node));
Set<FlowElement> toRemove = new HashSet<>();
for(FlowElement e : inner) {
if(e instanceof Extent) {
toRemove.add(e);
}
}
inner.removeAll(toRemove);
return inner;
}
private Scope getOutScope(FlowNode node) {
Set<FlowElement> nodeSinks = node.getSinkElements();
if(nodeSinks.size() != 1) {
throw new RuntimeException("Only nodes with one output supported right now");
}
FlowElement sink = nodeSinks.iterator().next();
Collection<Scope> outScopes = (Collection<Scope>) node.getPreviousScopes(sink);
if(outScopes.size() != 1) {
throw new RuntimeException("Only one incoming scope for last node of mapper allowed");
}
return outScopes.iterator().next();
}
private boolean allOfType(Set<FlowElement> set, Class<? extends FlowElement> type) {
for(FlowElement e : set) {
if(!(type.isInstance(e))) {
return false;
}
}
return true;
}
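// returns the single common successor of all given elements, or null if their successors differ or are not unique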
private FlowElement getCommonSuccessor(Set<FlowElement> set, FlowNode node) {
ElementGraph graph = node.getElementGraph();
FlowElement successor = null;
for(FlowElement e : set) {
List<FlowElement> successors = graph.successorListOf(e);
if(successors.size() > 1) {
return null;
}
else {
if(successor == null) {
successor = successors.get(0);
}
else if(successor != successors.get(0)){
return null;
}
}
}
return successor;
}
private <X> X getSingle(Set<X> set) {
if(set.size() != 1) {
throw new RuntimeException("Set size > 1");
}
return set.iterator().next();
}
private String[] registerKeyFields(DataSet<Tuple> input, Fields keyFields) {
return ((TupleTypeInfo)input.getType()).registerKeyFields(keyFields);
}
private void registerKryoTypes(Fields fields) {
if(fields.hasTypes()) {
Class[] fieldTypeClasses = fields.getTypesClasses();
for(Class fieldTypeClass : fieldTypeClasses) {
if(!fieldTypeClass.isPrimitive() &&
!fieldTypeClass.equals(String.class) &&
!Writable.class.isAssignableFrom(fieldTypeClass)) {
// register type if it is neither a primitive, String, or Writable
env.getConfig().registerKryoType(fieldTypeClass);
}
}
}
}
private org.apache.flink.configuration.Configuration getFlinkNodeConfig(FlowNode node) {
return FlinkConfigConverter.toFlinkConfig(this.getNodeConfig(node));
}
private Configuration getNodeConfig(FlowNode node) {
Configuration nodeConfig = HadoopUtil.copyConfiguration(this.getConfig());
ConfigurationSetter configSetter = new ConfigurationSetter(nodeConfig);
this.initConfFromNodeConfigDef(node.getElementGraph(), configSetter);
this.initConfFromStepConfigDef(configSetter);
nodeConfig.set("cascading.flow.node.num", Integer.toString(node.getOrdinal()));
return nodeConfig;
}
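// applies Cascading ConfigDef settings to a Hadoop Configuration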
private static class ConfigurationSetter implements ConfigDef.Setter
{
private final Configuration conf;
public ConfigurationSetter( Configuration conf )
{
this.conf = conf;
}
@Override
public String set( String key, String value ) {
String oldValue = get( key );
conf.set( key, value );
return oldValue;
}
@Override
public String update( String key, String value ) {
String oldValue = get( key );
if( oldValue == null ) {
conf.set(key, value);
}
else if( !oldValue.contains( value ) ) {
conf.set(key, oldValue + "," + value);
}
return oldValue;
}
@Override
public String get( String key ) {
String value = conf.get( key );
if( value == null || value.isEmpty() ) {
return null;
}
return value;
}
}
}