
eu.stratosphere.example.java.graph.PageRankBasic

/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/
package eu.stratosphere.example.java.graph;

import static eu.stratosphere.api.java.aggregation.Aggregations.SUM;

import java.util.ArrayList;
import java.util.Iterator;

import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.ExecutionEnvironment;
import eu.stratosphere.api.java.IterativeDataSet;
import eu.stratosphere.api.java.functions.FilterFunction;
import eu.stratosphere.api.java.functions.FlatMapFunction;
import eu.stratosphere.api.java.functions.GroupReduceFunction;
import eu.stratosphere.api.java.functions.MapFunction;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.example.java.graph.util.PageRankData;
import eu.stratosphere.util.Collector;

/**
 * A basic implementation of the Page Rank algorithm using a bulk iteration.
 * 
 * <p>
 * This implementation requires a set of pages (vertices) with associated ranks and a set
 * of directed links (edges) as input and works as follows.<br>
 * In each iteration, the rank of every page is evenly distributed to all pages it points to.
 * Each page collects the partial ranks of all pages that point to it, sums them up, and applies
 * a dampening factor to the sum. The result is the new rank of the page. A new iteration is
 * started with the new ranks of all pages.
 * This implementation terminates after a fixed number of iterations.<br>
 * See the <a href="https://en.wikipedia.org/wiki/PageRank">Wikipedia entry</a> for the Page Rank algorithm.
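 * 
 * <p>
 * Expressed as a formula (added here for illustration, not part of the original description), one iteration
 * computes for every page <code>p</code>:
 * <pre>
 *   newRank(p) = DAMPENING_FACTOR * sum_{q -> p} ( rank(q) / outDegree(q) ) + (1 - DAMPENING_FACTOR) / numVertices
 * </pre>
 * This is exactly what the {@code JoinVertexWithEdgesMatch}, {@code aggregate(SUM, 1)}, and {@code Dampener}
 * steps in {@code main} implement.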

 * <p>
 * Input files are plain text files and must be formatted as follows (a concrete sample of both files
 * is shown after the list):
 * <ul>
 * <li>Pages are represented by a (long) ID and a (double) rank, separated by a space character;
 * pages are separated by new-line characters.<br>
 * For example <code>"1 0.4\n2 0.3\n12 0.15\n42 0.05\n63 0.1\n"</code> gives five pages with the associated
 * ranks (1, 0.4), (2, 0.3), (12, 0.15), (42, 0.05), and (63, 0.1). Ranks should sum up to 1.0.
 * <li>Page links are represented as pairs of page IDs, separated by space characters;
 * edges are separated by new-line characters.<br>
 * For example <code>"1 2\n2 12\n1 12\n42 63\n"</code> gives four (directed) edges (1)-(2), (2)-(12), (1)-(12), and (42)-(63).<br>
 * For this simple implementation it is required that each page has at least one incoming and one outgoing link
 * (a page can point to itself).
 * </ul>
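 * 
 * <p>
 * As a concrete illustration (file names are hypothetical, the contents are the sample values from above),
 * the two input files could look as follows on disk:
 * <pre>
 *   pages.txt:
 *   1 0.4
 *   2 0.3
 *   12 0.15
 *   42 0.05
 *   63 0.1
 * 
 *   links.txt:
 *   1 2
 *   2 12
 *   1 12
 *   42 63
 * </pre>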

 * <p>
 * This example shows how to use:
 * <ul>
 * <li>Bulk Iterations
 * <li>Default Join
 * </ul>
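 * 
 * <p>
 * Usage (derived from {@code parseParameters} below; the argument names are descriptive placeholders):<br>
 * <code>PageRankBasic &lt;pages-with-ranks path&gt; &lt;edges path&gt; &lt;output path&gt; &lt;num vertices&gt; &lt;max iterations&gt;</code><br>
 * If no arguments are given, the program runs on the built-in default data from {@link PageRankData}
 * with 10 iterations.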
 */
@SuppressWarnings("serial")
public class PageRankBasic {

    private static final double DAMPENING_FACTOR = 0.85;
    private static final double EPSILON = 0.0001;

    // *************************************************************************
    //     PROGRAM
    // *************************************************************************

    public static void main(String[] args) throws Exception {

        parseParameters(args);

        // set up execution environment
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // get input data
        DataSet<Tuple2<Long, Double>> pageWithRankInput = getPageWithRankDataSet(env);
        DataSet<Tuple2<Long, Long>> edgeInput = getEdgeDataSet(env);

        // build adjacency list from edge input
        DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
                edgeInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

        // set iterative data set
        IterativeDataSet<Tuple2<Long, Double>> iteration = pageWithRankInput.iterate(maxIterations);

        DataSet<Tuple2<Long, Double>> newRanks = iteration
                // join pages with outgoing edges and distribute rank
                .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
                // collect and sum ranks
                .groupBy(0).aggregate(SUM, 1)
                // apply dampening factor
                .map(new Dampener(numVertices));

        DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
                newRanks,
                newRanks.join(iteration).where(0).equalTo(0)
                // termination condition
                .filter(new EpsilonFilter()));

        // emit result
        if (fileOutput) {
            finalPageRanks.writeAsCsv(outputPath, "\n", " ");
        } else {
            finalPageRanks.print();
        }

        // execute program
        env.execute("Basic Page Rank Example");
    }

    // *************************************************************************
    //     USER FUNCTIONS
    // *************************************************************************

    /**
     * A reduce function that takes a sequence of edges and builds the adjacency list for the vertex where the edges
     * originate. Run as a preprocessing step.
     */
    public static final class BuildOutgoingEdgeList extends GroupReduceFunction<Tuple2<Long, Long>, Tuple2<Long, Long[]>> {

        private final ArrayList<Long> neighbors = new ArrayList<Long>();

        @Override
        public void reduce(Iterator<Tuple2<Long, Long>> values, Collector<Tuple2<Long, Long[]>> out) {
            neighbors.clear();
            Long id = 0L;

            while (values.hasNext()) {
                Tuple2<Long, Long> n = values.next();
                id = n.f0;
                neighbors.add(n.f1);
            }
            out.collect(new Tuple2<Long, Long[]>(id, neighbors.toArray(new Long[neighbors.size()])));
        }
    }

    /**
     * Join function that distributes a fraction of a vertex's rank to all neighbors.
     */
    public static final class JoinVertexWithEdgesMatch extends FlatMapFunction<Tuple2<Tuple2<Long, Double>, Tuple2<Long, Long[]>>, Tuple2<Long, Double>> {

        @Override
        public void flatMap(Tuple2<Tuple2<Long, Double>, Tuple2<Long, Long[]>> value, Collector<Tuple2<Long, Double>> out) {
            Long[] neighbors = value.f1.f1;
            double rank = value.f0.f1;
            double rankToDistribute = rank / ((double) neighbors.length);

            for (int i = 0; i < neighbors.length; i++) {
                out.collect(new Tuple2<Long, Double>(neighbors[i], rankToDistribute));
            }
        }
    }

    /**
     * The function that applies the page rank dampening formula.
     */
    public static final class Dampener extends MapFunction<Tuple2<Long, Double>, Tuple2<Long, Double>> {

        private final double numVertices;

        public Dampener(double numVertices) {
            this.numVertices = numVertices;
        }

        @Override
        public Tuple2<Long, Double> map(Tuple2<Long, Double> value) {
            value.f1 = DAMPENING_FACTOR * value.f1 + (1 - DAMPENING_FACTOR) / numVertices;
            return value;
        }
    }

    /**
     * Filter that keeps only vertices where the rank difference is above the convergence threshold.
     */
    public static final class EpsilonFilter extends FilterFunction<Tuple2<Tuple2<Long, Double>, Tuple2<Long, Double>>> {

        @Override
        public boolean filter(Tuple2<Tuple2<Long, Double>, Tuple2<Long, Double>> value) {
            return Math.abs(value.f0.f1 - value.f1.f1) > EPSILON;
        }
    }

    // *************************************************************************
    //     UTIL METHODS
    // *************************************************************************

    private static boolean fileOutput = false;
    private static String pageWithRankInputPath = null;
    private static String edgeInputPath = null;
    private static String outputPath = null;
    private static int numVertices = 0;
    private static int maxIterations = 10;

    private static void parseParameters(String[] args) {

        if (args.length > 0) {
            if (args.length == 5) {
                fileOutput = true;
                pageWithRankInputPath = args[0];
                edgeInputPath = args[1];
                outputPath = args[2];
                numVertices = Integer.parseInt(args[3]);
                maxIterations = Integer.parseInt(args[4]);
            } else {
                System.err.println("Usage: PageRankBasic <pages-with-ranks path> <edges path> <output path> <num vertices> <max iterations>");
                System.exit(1);
            }
        } else {
            System.out.println("Executing PageRank Basic example with default parameters and built-in default data.");
            System.out.println("  Provide parameters to read input data from files.");
            System.out.println("  See the documentation for the correct format of input files.");
            System.out.println("  Usage: PageRankBasic <pages-with-ranks path> <edges path> <output path> <num vertices> <max iterations>");

            numVertices = PageRankData.getNumberOfPages();
        }
    }

    private static DataSet<Tuple2<Long, Double>> getPageWithRankDataSet(ExecutionEnvironment env) {
        if (fileOutput) {
            return env.readCsvFile(pageWithRankInputPath)
                        .fieldDelimiter(' ')
                        .lineDelimiter("\n")
                        .types(Long.class, Double.class);
        } else {
            return PageRankData.getDefaultPageWithRankDataSet(env);
        }
    }

    private static DataSet<Tuple2<Long, Long>> getEdgeDataSet(ExecutionEnvironment env) {
        if (fileOutput) {
            return env.readCsvFile(edgeInputPath)
                        .fieldDelimiter(' ')
                        .lineDelimiter("\n")
                        .types(Long.class, Long.class);
        } else {
            return PageRankData.getDefaultEdgeDataSet(env);
        }
    }
}



