eu.stratosphere.example.java.graph.PageRankBasic Maven / Gradle / Ivy
/***********************************************************************************************************************
*
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*
**********************************************************************************************************************/
package eu.stratosphere.example.java.graph;
import static eu.stratosphere.api.java.aggregation.Aggregations.SUM;
import java.util.ArrayList;
import java.util.Iterator;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.ExecutionEnvironment;
import eu.stratosphere.api.java.IterativeDataSet;
import eu.stratosphere.api.java.functions.FilterFunction;
import eu.stratosphere.api.java.functions.FlatMapFunction;
import eu.stratosphere.api.java.functions.GroupReduceFunction;
import eu.stratosphere.api.java.functions.MapFunction;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.example.java.graph.util.PageRankData;
import eu.stratosphere.util.Collector;
/**
* A basic implementation of the Page Rank algorithm using a bulk iteration.
*
*
* This implementation requires a set of pages (vertices) with associated ranks and a set
* of directed links (edges) as input and works as follows.
* In each iteration, the rank of every page is evenly distributed to all pages it points to.
* Each page collects the partial ranks of all pages that point to it, sums them up, and applies a dampening factor to the sum.
* The result is the new rank of the page. A new iteration is started with the new ranks of all pages.
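* This implementation runs for at most a fixed number of iterations. In addition, a termination criterion
* compares the old and new rank of every page against EPSILON (see EpsilonFilter).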
* This is the Wikipedia entry for the <a href="https://en.wikipedia.org/wiki/PageRank">Page Rank algorithm</a>.
*
*
* Input files are plain text files and must be formatted as follows:
*
* - Pages are represented as a (long) ID and a (double) rank separated by a space character; pages are separated by new-line characters.
* For example "1 0.4\n2 0.3\n12 0.15\n42 0.05\n63 0.1\n" gives five pages with associated ranks
* (1, 0.4), (2, 0.3), (12, 0.15), (42, 0.05), and (63, 0.1). Ranks should sum up to 1.0.
* - Page links are represented as pairs of page IDs which are separated by space
* characters. Edges are separated by new-line characters.
* For example "1 2\n2 12\n1 12\n42 63\n" gives four (directed) edges (1)->(2), (2)->(12), (1)->(12), and (42)->(63).
* For this simple implementation it is required that each page has at least one incoming and one outgoing link (a page can point to itself).
*
*
*
* This example shows how to use:
*
* - Bulk Iterations
*
* - Default Join
*
*
*
*/
@SuppressWarnings("serial")
public class PageRankBasic {
// Weight applied by Dampener to the summed incoming rank of a page; the remaining
// (1 - DAMPENING_FACTOR) is divided by the number of vertices and added to every page.
private static final double DAMPENING_FACTOR = 0.85;
// Threshold used by EpsilonFilter: a page whose rank changed by more than this amount
// between two iterations is kept in the termination-criterion data set.
private static final double EPSILON = 0.0001;
// *************************************************************************
// PROGRAM
// *************************************************************************
/**
 * Assembles the PageRank data flow and executes it.
 *
 * The program reads pages with their initial ranks and the directed edges, turns the edges into
 * one adjacency list per source page, and then repeatedly (a) joins every page with its adjacency
 * list to hand an equal share of its rank to each neighbor, (b) sums the shares per target page and
 * (c) applies the dampening formula. The loop is closed with the new ranks and a termination
 * criterion that keeps the pages whose rank changed by more than EPSILON.
 *
 * @param args command-line arguments, forwarded to parseParameters; when exactly five are given
 *             they are the page input path, the edge input path, the output path, the number of
 *             vertices and the maximum number of iterations
 * @throws Exception declared because the called environment methods may throw
 */
public static void main(String[] args) throws Exception {

    parseParameters(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data: (pageId, rank) and (sourceId, targetId)
    DataSet<Tuple2<Long, Double>> pageWithRankInput = getPageWithRankDataSet(env);
    DataSet<Tuple2<Long, Long>> edgeInput = getEdgeDataSet(env);

    // build adjacency list from edge input: (sourceId, targetIds[])
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            edgeInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pageWithRankInput.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(numVertices));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
            // termination condition: pairs of (new rank, old rank) that still differ by more than EPSILON
            .filter(new EpsilonFilter()));

    // emit result
    if (fileOutput) {
        finalPageRanks.writeAsCsv(outputPath, "\n", " ");
    } else {
        finalPageRanks.print();
    }

    // execute program
    env.execute("Basic Page Rank Example");
}
// *************************************************************************
// USER FUNCTIONS
// *************************************************************************
/**
* A reduce function that takes a sequence of edges and builds the adjacency list for the vertex where the edges
* originate. Run as a preprocessing step.
*/
public static final class BuildOutgoingEdgeList extends GroupReduceFunction, Tuple2> {
private final ArrayList neighbors = new ArrayList();
@Override
public void reduce(Iterator> values, Collector> out) {
neighbors.clear();
Long id = 0L;
while (values.hasNext()) {
Tuple2 n = values.next();
id = n.f0;
neighbors.add(n.f1);
}
out.collect(new Tuple2(id, neighbors.toArray(new Long[neighbors.size()])));
}
}
/**
* Join function that distributes a fraction of a vertex's rank to all neighbors.
*/
public static final class JoinVertexWithEdgesMatch extends FlatMapFunction, Tuple2>, Tuple2> {
@Override
public void flatMap(Tuple2, Tuple2> value, Collector> out){
Long[] neigbors = value.f1.f1;
double rank = value.f0.f1;
double rankToDistribute = rank / ((double) neigbors.length);
for (int i = 0; i < neigbors.length; i++) {
out.collect(new Tuple2(neigbors[i], rankToDistribute));
}
}
}
/**
* The function that applies the page rank dampening formula
*/
public static final class Dampener extends MapFunction, Tuple2> {
private final double numVertices;
public Dampener(double numVertices) {
this.numVertices = numVertices;
}
@Override
public Tuple2 map(Tuple2 value) {
value.f1 = DAMPENING_FACTOR*value.f1 + (1-DAMPENING_FACTOR)/numVertices;
return value;
}
}
/**
* Filter that filters vertices where the rank difference is below a threshold.
*/
public static final class EpsilonFilter extends FilterFunction, Tuple2>> {
@Override
public boolean filter(Tuple2, Tuple2> value) {
return Math.abs(value.f0.f1 - value.f1.f1) > EPSILON;
}
}
// *************************************************************************
// UTIL METHODS
// *************************************************************************
// Run-time configuration. parseParameters assigns every field in this group when exactly
// five command-line arguments are supplied.

// Locations of the page input, the edge input and the result.
private static String pageWithRankInputPath;
private static String edgeInputPath;
private static String outputPath;

// true: main writes the final ranks as CSV to outputPath; false: main prints them.
private static boolean fileOutput;

// Vertex count that main hands to the Dampener.
private static int numVertices;

// Iteration limit that main passes to iterate().
private static int maxIterations = 10;
private static void parseParameters(String[] args) {
if(args.length > 0) {
if(args.length == 5) {
fileOutput = true;
pageWithRankInputPath = args[0];
edgeInputPath = args[1];
outputPath = args[2];
numVertices = Integer.parseInt(args[3]);
maxIterations = Integer.parseInt(args[4]);
} else {
System.err.println("Usage: PageRankBasic © 2015 - 2025 Weber Informatics LLC | Privacy Policy