// eu.stratosphere.spargel.java.VertexCentricIteration Maven / Gradle / Ivy
/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.spargel.java;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.lang3.Validate;
import eu.stratosphere.api.common.aggregators.Aggregator;
import eu.stratosphere.api.common.functions.GenericCoGrouper;
import eu.stratosphere.api.common.operators.BinaryOperatorInformation;
import eu.stratosphere.api.common.operators.DualInputSemanticProperties;
import eu.stratosphere.api.common.operators.Operator;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.functions.CoGroupFunction;
import eu.stratosphere.api.java.operators.CustomUnaryOperation;
import eu.stratosphere.api.java.operators.TwoInputOperator;
import eu.stratosphere.api.common.operators.base.CoGroupOperatorBase;
import eu.stratosphere.api.common.operators.base.DeltaIterationBase;
import eu.stratosphere.api.java.tuple.Tuple;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.tuple.Tuple3;
import eu.stratosphere.api.java.typeutils.TupleTypeInfo;
import eu.stratosphere.api.java.typeutils.TypeExtractor;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.util.Collector;
/**
* This class represents iterative graph computations, programmed in a vertex-centric perspective.
* It is a special case of Bulk Synchronous Parallel computation. The paradigm has also been
* implemented by Google's Pregel system and by Apache Giraph.
*
* Vertex centric algorithms operate on graphs, which are defined through vertices and edges. The
* algorithms send messages along the edges and update the state of vertices based on
* the old state and the incoming messages. All vertices have an initial state.
* The computation terminates once no vertex updates it state any more.
* Additionally, a maximum number of iterations (supersteps) may be specified.
*
* The computation is here represented by two functions:
*
* - The {@link VertexUpdateFunction} receives incoming messages and may updates the state for
* the vertex. If a state is updated, messages are sent from this vertex. Initially, all vertices are
* considered updated.
* - The {@link MessagingFunction} takes the new vertex state and sends messages along the outgoing
* edges of the vertex. The outgoing edges may optionally have an associated value, such as a weight.
*
*
* Vertex-centric graph iterations are instantiated by the
* {@link #withPlainEdges(DataSet, VertexUpdateFunction, MessagingFunction, int)} method, or the
* {@link #withValuedEdges(DataSet, VertexUpdateFunction, MessagingFunction, int)} method, depending on whether
* the graph's edges are carrying values.
*
* @param The type of the vertex key (the vertex identifier).
* @param The type of the vertex value (the state of the vertex).
* @param The type of the message sent between vertices along the edges.
* @param The type of the values that are associated with the edges.
*/
public class VertexCentricIteration, VertexValue, Message, EdgeValue>
implements CustomUnaryOperation, Tuple2>
{
private final VertexUpdateFunction updateFunction;
private final MessagingFunction messagingFunction;
private final DataSet> edgesWithoutValue;
private final DataSet> edgesWithValue;
private final TypeInformation messageType;
private final Map>> aggregators;
private final int maximumNumberOfIterations;
private DataSet> initialVertices;
// ----------------------------------------------------------------------------------
private VertexCentricIteration(VertexUpdateFunction uf,
MessagingFunction mf,
DataSet> edgesWithoutValue,
int maximumNumberOfIterations)
{
// check that the edges are actually a valid tuple set of vertex key types
TypeInformation> edgesType = edgesWithoutValue.getType();
Validate.isTrue(edgesType.isTupleType() && edgesType.getArity() == 2, "The edges data set (for edges without edge values) must consist of 2-tuples.");
TupleTypeInfo> tupleInfo = (TupleTypeInfo>) edgesType;
Validate.isTrue(tupleInfo.getTypeAt(0).equals(tupleInfo.getTypeAt(1))
&& Comparable.class.isAssignableFrom(tupleInfo.getTypeAt(0).getTypeClass()),
"Both tuple fields (source and target vertex id) must be of the data type that represents the vertex key and implement the java.lang.Comparable interface.");
this.updateFunction = uf;
this.messagingFunction = mf;
this.edgesWithoutValue = edgesWithoutValue;
this.edgesWithValue = null;
this.maximumNumberOfIterations = maximumNumberOfIterations;
this.aggregators = new HashMap>>();
this.messageType = getMessageType(mf);
}
private VertexCentricIteration(VertexUpdateFunction uf,
MessagingFunction mf,
DataSet> edgesWithValue,
int maximumNumberOfIterations,
boolean edgeHasValueMarker)
{
// check that the edges are actually a valid tuple set of vertex key types
TypeInformation> edgesType = edgesWithValue.getType();
Validate.isTrue(edgesType.isTupleType() && edgesType.getArity() == 3, "The edges data set (for edges with edge values) must consist of 3-tuples.");
TupleTypeInfo> tupleInfo = (TupleTypeInfo>) edgesType;
Validate.isTrue(tupleInfo.getTypeAt(0).equals(tupleInfo.getTypeAt(1))
&& Comparable.class.isAssignableFrom(tupleInfo.getTypeAt(0).getTypeClass()),
"The first two tuple fields (source and target vertex id) must be of the data type that represents the vertex key and implement the java.lang.Comparable interface.");
this.updateFunction = uf;
this.messagingFunction = mf;
this.edgesWithoutValue = null;
this.edgesWithValue = edgesWithValue;
this.maximumNumberOfIterations = maximumNumberOfIterations;
this.aggregators = new HashMap>>();
this.messageType = getMessageType(mf);
}
private TypeInformation getMessageType(MessagingFunction mf) {
return TypeExtractor.createTypeInfo(MessagingFunction.class, mf.getClass(), 2, null, null);
}
/**
* Registers a new aggregator. Aggregators registered here are available during the execution of the vertex updates
* via {@link VertexUpdateFunction#getIterationAggregator(String)} and
* {@link VertexUpdateFunction#getPreviousIterationAggregate(String)}.
*
* @param name The name of the aggregator, used to retrieve it and its aggregates during execution.
* @param aggregator The aggregator.
*/
public void registerAggregator(String name, Class extends Aggregator>> aggregator) {
this.aggregators.put(name, aggregator);
}
// --------------------------------------------------------------------------------------------
// Custom Operator behavior
// --------------------------------------------------------------------------------------------
/**
* Sets the input data set for this operator. In the case of this operator this input data set represents
* the set of vertices with their initial state.
*
* @param inputData The input data set, which in the case of this operator represents the set of
* vertices with their initial state.
*
* @see eu.stratosphere.api.java.operators.CustomUnaryOperation#setInput(eu.stratosphere.api.java.DataSet)
*/
@Override
public void setInput(DataSet> inputData) {
// sanity check that we really have two tuples
TypeInformation> inputType = inputData.getType();
Validate.isTrue(inputType.isTupleType() && inputType.getArity() == 2, "The input data set (the initial vertices) must consist of 2-tuples.");
// check that the key type here is the same as for the edges
TypeInformation keyType = ((TupleTypeInfo>) inputType).getTypeAt(0);
TypeInformation> edgeType = edgesWithoutValue != null ? edgesWithoutValue.getType() : edgesWithValue.getType();
TypeInformation edgeKeyType = ((TupleTypeInfo>) edgeType).getTypeAt(0);
Validate.isTrue(keyType.equals(edgeKeyType), "The first tuple field (the vertex id) of the input data set (the initial vertices) " +
"must be the same data type as the first fields of the edge data set (the source vertex id). " +
"Here, the key type for the vertex ids is '%s' and the key type for the edges is '%s'.", keyType, edgeKeyType);
this.initialVertices = inputData;
}
/**
* Creates the operator that represents this vertex-centric graph computation.
*
* @return The operator that represents this vertex-centric graph computation.
*/
@Override
public GraphIterationOperator createOperator() {
VertexUpdateUdf updateUdf = new VertexUpdateUdf(updateFunction);
if (edgesWithoutValue != null) {
// edges have no values
MessagingUdfNoEdgeValues messenger = new MessagingUdfNoEdgeValues(messagingFunction);
return new GraphIterationOperator>(
initialVertices, edgesWithoutValue, updateUdf, messenger, messageType, aggregators, maximumNumberOfIterations);
}
else {
// edges have values
// edges have no values
MessagingUdfWithEdgeValues messenger = new MessagingUdfWithEdgeValues(messagingFunction);
return new GraphIterationOperator>(
initialVertices, edgesWithValue, updateUdf, messenger, messageType, aggregators, maximumNumberOfIterations);
}
}
// --------------------------------------------------------------------------------------------
// Constructor builders to avoid signature conflicts with generic type erasure
// --------------------------------------------------------------------------------------------
/**
* Creates a new vertex-centric iteration operator for graphs where the edges are not associated with a value.
*
* @param edgesWithoutValue The data set containing edges. Edges are represented as 2-tuples: (source-id, target-id)
* @param vertexUpdateFunction The function that updates the state of the vertices from the incoming messages.
* @param messagingFunction The function that turns changed vertex states into messages along the edges.
*
* @param The type of the vertex key (the vertex identifier).
* @param The type of the vertex value (the state of the vertex).
* @param The type of the message sent between vertices along the edges.
*
* @return An in stance of the vertex-centric graph computation operator.
*/
public static final , VertexValue, Message>
VertexCentricIteration withPlainEdges(
DataSet> edgesWithoutValue,
VertexUpdateFunction vertexUpdateFunction,
MessagingFunction messagingFunction,
int maximumNumberOfIterations)
{
@SuppressWarnings("unchecked")
MessagingFunction tmf =
(MessagingFunction) messagingFunction;
return new VertexCentricIteration(vertexUpdateFunction, tmf, edgesWithoutValue, maximumNumberOfIterations);
}
/**
* Creates a new vertex-centric iteration operator for graphs where the edges are associated with a value (such as
* a weight or distance).
*
* @param edgesWithValue The data set containing edges. Edges are represented as 2-tuples: (source-id, target-id)
* @param uf The function that updates the state of the vertices from the incoming messages.
* @param mf The function that turns changed vertex states into messages along the edges.
*
* @param The type of the vertex key (the vertex identifier).
* @param The type of the vertex value (the state of the vertex).
* @param The type of the message sent between vertices along the edges.
* @param The type of the values that are associated with the edges.
*
* @return An in stance of the vertex-centric graph computation operator.
*/
public static final , VertexValue, Message, EdgeValue>
VertexCentricIteration withValuedEdges(
DataSet> edgesWithValue,
VertexUpdateFunction uf,
MessagingFunction mf,
int maximumNumberOfIterations)
{
return new VertexCentricIteration(uf, mf, edgesWithValue, maximumNumberOfIterations, true);
}
// --------------------------------------------------------------------------------------------
// Wrapping UDFs
// --------------------------------------------------------------------------------------------
private static final class VertexUpdateUdf, VertexValue, Message>
extends CoGroupFunction, Tuple2, Tuple2>
{
private static final long serialVersionUID = 1L;
private final VertexUpdateFunction vertexUpdateFunction;
private final MessageIterator messageIter = new MessageIterator();
private VertexUpdateUdf(VertexUpdateFunction vertexUpdateFunction) {
this.vertexUpdateFunction = vertexUpdateFunction;
}
@Override
public void coGroup(Iterator> messages, Iterator> vertex,
Collector> out)
throws Exception
{
if (vertex.hasNext()) {
Tuple2 vertexState = vertex.next();
@SuppressWarnings("unchecked")
Iterator> downcastIter = (Iterator>) (Iterator>) messages;
messageIter.setSource(downcastIter);
vertexUpdateFunction.setOutput(vertexState, out);
vertexUpdateFunction.updateVertex(vertexState.f0, vertexState.f1, messageIter);
} else {
if (messages.hasNext()) {
String message = "Target vertex does not exist!.";
try {
Tuple2 next = messages.next();
message = "Target vertex '" + next.f0 + "' does not exist!.";
} catch (Throwable t) {}
throw new Exception(message);
} else {
throw new Exception();
}
}
}
@Override
public void open(Configuration parameters) throws Exception {
if (getIterationRuntimeContext().getSuperstepNumber() == 1) {
this.vertexUpdateFunction.init(getIterationRuntimeContext());
}
this.vertexUpdateFunction.preSuperstep();
}
@Override
public void close() throws Exception {
this.vertexUpdateFunction.postSuperstep();
}
}
/*
* UDF that encapsulates the message sending function for graphs where the edges have no associated values.
*/
private static final class MessagingUdfNoEdgeValues, VertexValue, Message>
extends CoGroupFunction, Tuple2, Tuple2>
{
private static final long serialVersionUID = 1L;
private final MessagingFunction messagingFunction;
private MessagingUdfNoEdgeValues(MessagingFunction messagingFunction) {
this.messagingFunction = messagingFunction;
}
@Override
public void coGroup(Iterator> edges,
Iterator> state, Collector> out)
throws Exception
{
if (state.hasNext()) {
Tuple2 newVertexState = state.next();
messagingFunction.set((Iterator>) edges, out);
messagingFunction.sendMessages(newVertexState.f0, newVertexState.f1);
}
}
@Override
public void open(Configuration parameters) throws Exception {
if (getIterationRuntimeContext().getSuperstepNumber() == 1) {
this.messagingFunction.init(getIterationRuntimeContext(), false);
}
this.messagingFunction.preSuperstep();
}
@Override
public void close() throws Exception {
this.messagingFunction.postSuperstep();
}
}
/*
* UDF that encapsulates the message sending function for graphs where the edges have an associated value.
*/
private static final class MessagingUdfWithEdgeValues, VertexValue, Message, EdgeValue>
extends CoGroupFunction, Tuple2, Tuple2>
{
private static final long serialVersionUID = 1L;
private final MessagingFunction messagingFunction;
private MessagingUdfWithEdgeValues(MessagingFunction messagingFunction) {
this.messagingFunction = messagingFunction;
}
@Override
public void coGroup(Iterator> edges,
Iterator> state, Collector> out)
throws Exception
{
if (state.hasNext()) {
Tuple2 newVertexState = state.next();
messagingFunction.set((Iterator>) edges, out);
messagingFunction.sendMessages(newVertexState.f0, newVertexState.f1);
}
}
@Override
public void open(Configuration parameters) throws Exception {
if (getIterationRuntimeContext().getSuperstepNumber() == 1) {
this.messagingFunction.init(getIterationRuntimeContext(), true);
}
this.messagingFunction.preSuperstep();
}
@Override
public void close() throws Exception {
this.messagingFunction.postSuperstep();
}
}
// --------------------------------------------------------------------------------------------
// The data flow operator
// --------------------------------------------------------------------------------------------
/*
* The data flow operator. It presents itself to the outside as a two-input operator with inputs vertices and edges.
* Internally, it create a delta iteration node, which is also a two input operator. But the delta iteration
* node uses the vertex input (the first) for both inputs (solution set and initial workset) and uses the
* second input (edges) as a data source inside the iteration.
*/
private static final class GraphIterationOperator, VertexValue, Message, EdgeType extends Tuple> extends
TwoInputOperator, EdgeType, Tuple2, GraphIterationOperator>
{
private final DataSet edges;
private final CoGroupFunction, Tuple2, Tuple2> updateFunction;
private final CoGroupFunction, Tuple2> messagingFunction;
private final TypeInformation> messageType;
private final Map>> aggregators;
private final int maximumNumberOfIterations;
private GraphIterationOperator(DataSet> initialVertices,
DataSet edges,
VertexUpdateUdf updateFunction,
CoGroupFunction, Tuple2> messagingFunction,
TypeInformation messageType,
Map>> aggregators,
int maximumNumberOfIterations)
{
super(initialVertices, edges, initialVertices.getType());
this.edges = edges;
this.updateFunction = updateFunction;
this.messagingFunction = messagingFunction;
this.aggregators = aggregators;
this.maximumNumberOfIterations = maximumNumberOfIterations;
// construct the type for the messages between the messaging function and the vertex update function
TypeInformation keyType = ((TupleTypeInfo>) initialVertices.getType()).getTypeAt(0);
this.messageType = new TupleTypeInfo>(keyType, messageType);
}
@Override
protected eu.stratosphere.api.common.operators.DualInputOperator, Tuple2, Tuple2, ?> translateToDataFlow(Operator> input1, Operator input2) {
final String name = (getName() != null) ? getName() :
"Vertex-centric iteration (" + updateFunction + " | " + messagingFunction + ")";
final int[] zeroKeyPos = new int[] {0};
final DeltaIterationBase, Tuple2> iteration =
new DeltaIterationBase, Tuple2>(
new BinaryOperatorInformation, Tuple2, Tuple2>(getInput1Type(), getInput1Type(), getInput1Type()),
zeroKeyPos, name);
iteration.setMaximumNumberOfIterations(maximumNumberOfIterations);
for (Map.Entry>> entry : aggregators.entrySet()) {
iteration.getAggregators().registerAggregator(entry.getKey(), entry.getValue());
}
final CoGroupOperatorBase, Tuple2, GenericCoGrouper, Tuple2>> messenger =
new CoGroupOperatorBase, Tuple2, GenericCoGrouper, Tuple2>>(
messagingFunction, new BinaryOperatorInformation, Tuple2>(edges.getType(), getInput1Type(), messageType), zeroKeyPos, zeroKeyPos, "Messaging");
messenger.setSecondInput(iteration.getWorkset());
final CoGroupOperatorBase, Tuple2, Tuple2, GenericCoGrouper, Tuple2, Tuple2>> updater =
new CoGroupOperatorBase, Tuple2, Tuple2, GenericCoGrouper, Tuple2, Tuple2>>(
updateFunction, new BinaryOperatorInformation, Tuple2, Tuple2>(messageType, getInput1Type(), getInput1Type()), zeroKeyPos, zeroKeyPos, "Vertex State Updates");
updater.setFirstInput(messenger);
updater.setSecondInput(iteration.getSolutionSet());
// let the operator know that we preserve the key field
DualInputSemanticProperties semanticProps = new DualInputSemanticProperties();
semanticProps.addForwardedField1(0, 0);
semanticProps.addForwardedField2(0, 0);
updater.setSemanticProperties(semanticProps);
iteration.setSolutionSetDelta(updater);
iteration.setNextWorkset(updater);
// set inputs
iteration.setFirstInput(input1);
iteration.setSecondInput(input1);
messenger.setFirstInput(input2);
return iteration;
}
}
}