All downloads are free. The search and download functionality uses the official Maven repository.

eu.stratosphere.spargel.java.VertexCentricIteration Maven / Gradle / Ivy

/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/
package eu.stratosphere.spargel.java;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.lang3.Validate;

import eu.stratosphere.api.common.aggregators.Aggregator;
import eu.stratosphere.api.common.functions.GenericCoGrouper;
import eu.stratosphere.api.common.operators.BinaryOperatorInformation;
import eu.stratosphere.api.common.operators.DualInputSemanticProperties;
import eu.stratosphere.api.common.operators.Operator;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.functions.CoGroupFunction;
import eu.stratosphere.api.java.operators.CustomUnaryOperation;
import eu.stratosphere.api.java.operators.TwoInputOperator;
import eu.stratosphere.api.common.operators.base.CoGroupOperatorBase;
import eu.stratosphere.api.common.operators.base.DeltaIterationBase;
import eu.stratosphere.api.java.tuple.Tuple;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.tuple.Tuple3;
import eu.stratosphere.api.java.typeutils.TupleTypeInfo;
import eu.stratosphere.api.java.typeutils.TypeExtractor;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.util.Collector;

/**
 * This class represents iterative graph computations, programmed in a vertex-centric perspective.
 * It is a special case of Bulk Synchronous Parallel computation. The paradigm has also been
 * implemented by Google's Pregel system and by Apache Giraph.
 * 

* Vertex centric algorithms operate on graphs, which are defined through vertices and edges. The * algorithms send messages along the edges and update the state of vertices based on * the old state and the incoming messages. All vertices have an initial state. * The computation terminates once no vertex updates it state any more. * Additionally, a maximum number of iterations (supersteps) may be specified. *

* The computation is here represented by two functions: *

    *
  • The {@link VertexUpdateFunction} receives incoming messages and may updates the state for * the vertex. If a state is updated, messages are sent from this vertex. Initially, all vertices are * considered updated.
  • *
  • The {@link MessagingFunction} takes the new vertex state and sends messages along the outgoing * edges of the vertex. The outgoing edges may optionally have an associated value, such as a weight.
  • *
*

* Vertex-centric graph iterations are instantiated by the * {@link #withPlainEdges(DataSet, VertexUpdateFunction, MessagingFunction, int)} method, or the * {@link #withValuedEdges(DataSet, VertexUpdateFunction, MessagingFunction, int)} method, depending on whether * the graph's edges are carrying values. * * @param The type of the vertex key (the vertex identifier). * @param The type of the vertex value (the state of the vertex). * @param The type of the message sent between vertices along the edges. * @param The type of the values that are associated with the edges. */ public class VertexCentricIteration, VertexValue, Message, EdgeValue> implements CustomUnaryOperation, Tuple2> { private final VertexUpdateFunction updateFunction; private final MessagingFunction messagingFunction; private final DataSet> edgesWithoutValue; private final DataSet> edgesWithValue; private final TypeInformation messageType; private final Map>> aggregators; private final int maximumNumberOfIterations; private DataSet> initialVertices; // ---------------------------------------------------------------------------------- private VertexCentricIteration(VertexUpdateFunction uf, MessagingFunction mf, DataSet> edgesWithoutValue, int maximumNumberOfIterations) { // check that the edges are actually a valid tuple set of vertex key types TypeInformation> edgesType = edgesWithoutValue.getType(); Validate.isTrue(edgesType.isTupleType() && edgesType.getArity() == 2, "The edges data set (for edges without edge values) must consist of 2-tuples."); TupleTypeInfo tupleInfo = (TupleTypeInfo) edgesType; Validate.isTrue(tupleInfo.getTypeAt(0).equals(tupleInfo.getTypeAt(1)) && Comparable.class.isAssignableFrom(tupleInfo.getTypeAt(0).getTypeClass()), "Both tuple fields (source and target vertex id) must be of the data type that represents the vertex key and implement the java.lang.Comparable interface."); this.updateFunction = uf; this.messagingFunction = mf; this.edgesWithoutValue = edgesWithoutValue; 
this.edgesWithValue = null; this.maximumNumberOfIterations = maximumNumberOfIterations; this.aggregators = new HashMap>>(); this.messageType = getMessageType(mf); } private VertexCentricIteration(VertexUpdateFunction uf, MessagingFunction mf, DataSet> edgesWithValue, int maximumNumberOfIterations, boolean edgeHasValueMarker) { // check that the edges are actually a valid tuple set of vertex key types TypeInformation> edgesType = edgesWithValue.getType(); Validate.isTrue(edgesType.isTupleType() && edgesType.getArity() == 3, "The edges data set (for edges with edge values) must consist of 3-tuples."); TupleTypeInfo tupleInfo = (TupleTypeInfo) edgesType; Validate.isTrue(tupleInfo.getTypeAt(0).equals(tupleInfo.getTypeAt(1)) && Comparable.class.isAssignableFrom(tupleInfo.getTypeAt(0).getTypeClass()), "The first two tuple fields (source and target vertex id) must be of the data type that represents the vertex key and implement the java.lang.Comparable interface."); this.updateFunction = uf; this.messagingFunction = mf; this.edgesWithoutValue = null; this.edgesWithValue = edgesWithValue; this.maximumNumberOfIterations = maximumNumberOfIterations; this.aggregators = new HashMap>>(); this.messageType = getMessageType(mf); } private TypeInformation getMessageType(MessagingFunction mf) { return TypeExtractor.createTypeInfo(MessagingFunction.class, mf.getClass(), 2, null, null); } /** * Registers a new aggregator. Aggregators registered here are available during the execution of the vertex updates * via {@link VertexUpdateFunction#getIterationAggregator(String)} and * {@link VertexUpdateFunction#getPreviousIterationAggregate(String)}. * * @param name The name of the aggregator, used to retrieve it and its aggregates during execution. * @param aggregator The aggregator. 
*/ public void registerAggregator(String name, Class> aggregator) { this.aggregators.put(name, aggregator); } // -------------------------------------------------------------------------------------------- // Custom Operator behavior // -------------------------------------------------------------------------------------------- /** * Sets the input data set for this operator. In the case of this operator this input data set represents * the set of vertices with their initial state. * * @param inputData The input data set, which in the case of this operator represents the set of * vertices with their initial state. * * @see eu.stratosphere.api.java.operators.CustomUnaryOperation#setInput(eu.stratosphere.api.java.DataSet) */ @Override public void setInput(DataSet> inputData) { // sanity check that we really have two tuples TypeInformation> inputType = inputData.getType(); Validate.isTrue(inputType.isTupleType() && inputType.getArity() == 2, "The input data set (the initial vertices) must consist of 2-tuples."); // check that the key type here is the same as for the edges TypeInformation keyType = ((TupleTypeInfo) inputType).getTypeAt(0); TypeInformation edgeType = edgesWithoutValue != null ? edgesWithoutValue.getType() : edgesWithValue.getType(); TypeInformation edgeKeyType = ((TupleTypeInfo) edgeType).getTypeAt(0); Validate.isTrue(keyType.equals(edgeKeyType), "The first tuple field (the vertex id) of the input data set (the initial vertices) " + "must be the same data type as the first fields of the edge data set (the source vertex id). " + "Here, the key type for the vertex ids is '%s' and the key type for the edges is '%s'.", keyType, edgeKeyType); this.initialVertices = inputData; } /** * Creates the operator that represents this vertex-centric graph computation. * * @return The operator that represents this vertex-centric graph computation. 
*/ @Override public GraphIterationOperator createOperator() { VertexUpdateUdf updateUdf = new VertexUpdateUdf(updateFunction); if (edgesWithoutValue != null) { // edges have no values MessagingUdfNoEdgeValues messenger = new MessagingUdfNoEdgeValues(messagingFunction); return new GraphIterationOperator>( initialVertices, edgesWithoutValue, updateUdf, messenger, messageType, aggregators, maximumNumberOfIterations); } else { // edges have values // edges have no values MessagingUdfWithEdgeValues messenger = new MessagingUdfWithEdgeValues(messagingFunction); return new GraphIterationOperator>( initialVertices, edgesWithValue, updateUdf, messenger, messageType, aggregators, maximumNumberOfIterations); } } // -------------------------------------------------------------------------------------------- // Constructor builders to avoid signature conflicts with generic type erasure // -------------------------------------------------------------------------------------------- /** * Creates a new vertex-centric iteration operator for graphs where the edges are not associated with a value. * * @param edgesWithoutValue The data set containing edges. Edges are represented as 2-tuples: (source-id, target-id) * @param vertexUpdateFunction The function that updates the state of the vertices from the incoming messages. * @param messagingFunction The function that turns changed vertex states into messages along the edges. * * @param The type of the vertex key (the vertex identifier). * @param The type of the vertex value (the state of the vertex). * @param The type of the message sent between vertices along the edges. * * @return An in stance of the vertex-centric graph computation operator. 
*/ public static final , VertexValue, Message> VertexCentricIteration withPlainEdges( DataSet> edgesWithoutValue, VertexUpdateFunction vertexUpdateFunction, MessagingFunction messagingFunction, int maximumNumberOfIterations) { @SuppressWarnings("unchecked") MessagingFunction tmf = (MessagingFunction) messagingFunction; return new VertexCentricIteration(vertexUpdateFunction, tmf, edgesWithoutValue, maximumNumberOfIterations); } /** * Creates a new vertex-centric iteration operator for graphs where the edges are associated with a value (such as * a weight or distance). * * @param edgesWithValue The data set containing edges. Edges are represented as 2-tuples: (source-id, target-id) * @param uf The function that updates the state of the vertices from the incoming messages. * @param mf The function that turns changed vertex states into messages along the edges. * * @param The type of the vertex key (the vertex identifier). * @param The type of the vertex value (the state of the vertex). * @param The type of the message sent between vertices along the edges. * @param The type of the values that are associated with the edges. * * @return An in stance of the vertex-centric graph computation operator. 
*/ public static final , VertexValue, Message, EdgeValue> VertexCentricIteration withValuedEdges( DataSet> edgesWithValue, VertexUpdateFunction uf, MessagingFunction mf, int maximumNumberOfIterations) { return new VertexCentricIteration(uf, mf, edgesWithValue, maximumNumberOfIterations, true); } // -------------------------------------------------------------------------------------------- // Wrapping UDFs // -------------------------------------------------------------------------------------------- private static final class VertexUpdateUdf, VertexValue, Message> extends CoGroupFunction, Tuple2, Tuple2> { private static final long serialVersionUID = 1L; private final VertexUpdateFunction vertexUpdateFunction; private final MessageIterator messageIter = new MessageIterator(); private VertexUpdateUdf(VertexUpdateFunction vertexUpdateFunction) { this.vertexUpdateFunction = vertexUpdateFunction; } @Override public void coGroup(Iterator> messages, Iterator> vertex, Collector> out) throws Exception { if (vertex.hasNext()) { Tuple2 vertexState = vertex.next(); @SuppressWarnings("unchecked") Iterator> downcastIter = (Iterator>) (Iterator) messages; messageIter.setSource(downcastIter); vertexUpdateFunction.setOutput(vertexState, out); vertexUpdateFunction.updateVertex(vertexState.f0, vertexState.f1, messageIter); } else { if (messages.hasNext()) { String message = "Target vertex does not exist!."; try { Tuple2 next = messages.next(); message = "Target vertex '" + next.f0 + "' does not exist!."; } catch (Throwable t) {} throw new Exception(message); } else { throw new Exception(); } } } @Override public void open(Configuration parameters) throws Exception { if (getIterationRuntimeContext().getSuperstepNumber() == 1) { this.vertexUpdateFunction.init(getIterationRuntimeContext()); } this.vertexUpdateFunction.preSuperstep(); } @Override public void close() throws Exception { this.vertexUpdateFunction.postSuperstep(); } } /* * UDF that encapsulates the message sending function 
for graphs where the edges have no associated values. */ private static final class MessagingUdfNoEdgeValues, VertexValue, Message> extends CoGroupFunction, Tuple2, Tuple2> { private static final long serialVersionUID = 1L; private final MessagingFunction messagingFunction; private MessagingUdfNoEdgeValues(MessagingFunction messagingFunction) { this.messagingFunction = messagingFunction; } @Override public void coGroup(Iterator> edges, Iterator> state, Collector> out) throws Exception { if (state.hasNext()) { Tuple2 newVertexState = state.next(); messagingFunction.set((Iterator) edges, out); messagingFunction.sendMessages(newVertexState.f0, newVertexState.f1); } } @Override public void open(Configuration parameters) throws Exception { if (getIterationRuntimeContext().getSuperstepNumber() == 1) { this.messagingFunction.init(getIterationRuntimeContext(), false); } this.messagingFunction.preSuperstep(); } @Override public void close() throws Exception { this.messagingFunction.postSuperstep(); } } /* * UDF that encapsulates the message sending function for graphs where the edges have an associated value. 
*/ private static final class MessagingUdfWithEdgeValues, VertexValue, Message, EdgeValue> extends CoGroupFunction, Tuple2, Tuple2> { private static final long serialVersionUID = 1L; private final MessagingFunction messagingFunction; private MessagingUdfWithEdgeValues(MessagingFunction messagingFunction) { this.messagingFunction = messagingFunction; } @Override public void coGroup(Iterator> edges, Iterator> state, Collector> out) throws Exception { if (state.hasNext()) { Tuple2 newVertexState = state.next(); messagingFunction.set((Iterator) edges, out); messagingFunction.sendMessages(newVertexState.f0, newVertexState.f1); } } @Override public void open(Configuration parameters) throws Exception { if (getIterationRuntimeContext().getSuperstepNumber() == 1) { this.messagingFunction.init(getIterationRuntimeContext(), true); } this.messagingFunction.preSuperstep(); } @Override public void close() throws Exception { this.messagingFunction.postSuperstep(); } } // -------------------------------------------------------------------------------------------- // The data flow operator // -------------------------------------------------------------------------------------------- /* * The data flow operator. It presents itself to the outside as a two-input operator with inputs vertices and edges. * Internally, it create a delta iteration node, which is also a two input operator. But the delta iteration * node uses the vertex input (the first) for both inputs (solution set and initial workset) and uses the * second input (edges) as a data source inside the iteration. 
*/ private static final class GraphIterationOperator, VertexValue, Message, EdgeType extends Tuple> extends TwoInputOperator, EdgeType, Tuple2, GraphIterationOperator> { private final DataSet edges; private final CoGroupFunction, Tuple2, Tuple2> updateFunction; private final CoGroupFunction, Tuple2> messagingFunction; private final TypeInformation> messageType; private final Map>> aggregators; private final int maximumNumberOfIterations; private GraphIterationOperator(DataSet> initialVertices, DataSet edges, VertexUpdateUdf updateFunction, CoGroupFunction, Tuple2> messagingFunction, TypeInformation messageType, Map>> aggregators, int maximumNumberOfIterations) { super(initialVertices, edges, initialVertices.getType()); this.edges = edges; this.updateFunction = updateFunction; this.messagingFunction = messagingFunction; this.aggregators = aggregators; this.maximumNumberOfIterations = maximumNumberOfIterations; // construct the type for the messages between the messaging function and the vertex update function TypeInformation keyType = ((TupleTypeInfo) initialVertices.getType()).getTypeAt(0); this.messageType = new TupleTypeInfo>(keyType, messageType); } @Override protected eu.stratosphere.api.common.operators.DualInputOperator, Tuple2, Tuple2, ?> translateToDataFlow(Operator> input1, Operator input2) { final String name = (getName() != null) ? 
getName() : "Vertex-centric iteration (" + updateFunction + " | " + messagingFunction + ")"; final int[] zeroKeyPos = new int[] {0}; final DeltaIterationBase, Tuple2> iteration = new DeltaIterationBase, Tuple2>( new BinaryOperatorInformation, Tuple2, Tuple2>(getInput1Type(), getInput1Type(), getInput1Type()), zeroKeyPos, name); iteration.setMaximumNumberOfIterations(maximumNumberOfIterations); for (Map.Entry>> entry : aggregators.entrySet()) { iteration.getAggregators().registerAggregator(entry.getKey(), entry.getValue()); } final CoGroupOperatorBase, Tuple2, GenericCoGrouper, Tuple2>> messenger = new CoGroupOperatorBase, Tuple2, GenericCoGrouper, Tuple2>>( messagingFunction, new BinaryOperatorInformation, Tuple2>(edges.getType(), getInput1Type(), messageType), zeroKeyPos, zeroKeyPos, "Messaging"); messenger.setSecondInput(iteration.getWorkset()); final CoGroupOperatorBase, Tuple2, Tuple2, GenericCoGrouper, Tuple2, Tuple2>> updater = new CoGroupOperatorBase, Tuple2, Tuple2, GenericCoGrouper, Tuple2, Tuple2>>( updateFunction, new BinaryOperatorInformation, Tuple2, Tuple2>(messageType, getInput1Type(), getInput1Type()), zeroKeyPos, zeroKeyPos, "Vertex State Updates"); updater.setFirstInput(messenger); updater.setSecondInput(iteration.getSolutionSet()); // let the operator know that we preserve the key field DualInputSemanticProperties semanticProps = new DualInputSemanticProperties(); semanticProps.addForwardedField1(0, 0); semanticProps.addForwardedField2(0, 0); updater.setSemanticProperties(semanticProps); iteration.setSolutionSetDelta(updater); iteration.setNextWorkset(updater); // set inputs iteration.setFirstInput(input1); iteration.setSecondInput(input1); messenger.setFirstInput(input2); return iteration; } } }