All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.bespin.java.mapreduce.pagerank.SequentialPersonalizedPageRank Maven / Gradle / Ivy

The newest version!
/**
 * Bespin: reference implementations of "big data" algorithms
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.bespin.java.mapreduce.pagerank;

import com.google.common.base.Function;
import edu.uci.ics.jung.algorithms.cluster.WeakComponentClusterer;
import edu.uci.ics.jung.algorithms.importance.Ranking;
import edu.uci.ics.jung.algorithms.scoring.PageRankWithPriors;
import edu.uci.ics.jung.graph.DirectedSparseGraph;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.PriorityQueue;
import java.util.Set;

/**
 * 

* Program that computes personalized PageRank for a graph using the JUNG package. Program takes two command-line * arguments: the first is a file containing the graph data, and the second is the random jump * factor (a typical setting is 0.15). *

* *

* The graph should be represented as an adjacency list. Each line should have at least one token; * tokens should be tab delimited. The first token represents the unique id of the source node; * subsequent tokens represent its link targets (i.e., outlinks from the source node). For * completeness, there should be a line representing all nodes, even nodes without outlinks (those * lines will simply contain one token, the source node id). *

* * @author Jimmy Lin */ public class SequentialPersonalizedPageRank { private SequentialPersonalizedPageRank() {} private static final class Args { @Option(name = "-input", metaVar = "[path]", required = true, usage = "input path") String input; @Option(name = "-source", metaVar = "[node]", required = true, usage = "source node (i.e., destination of the random jump)") String source; @Option(name = "-jump", metaVar = "[num]", usage = "random jump factor") float alpha = 0.15f; } public static void main(String[] argv) throws IOException { final Args args = new Args(); CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100)); try { parser.parseArgument(argv); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.exit(-1); } int edgeCnt = 0; DirectedSparseGraph graph = new DirectedSparseGraph<>(); BufferedReader data = new BufferedReader(new InputStreamReader(new FileInputStream(args.input))); String line; while ((line = data.readLine()) != null) { String[] arr = line.split("\\t"); for (int i = 1; i < arr.length; i++) { graph.addEdge(edgeCnt++, arr[0], arr[i]); } } data.close(); if (!graph.containsVertex(args.source)) { System.err.println("Error: source node not found in the graph!"); System.exit(-1); } WeakComponentClusterer clusterer = new WeakComponentClusterer<>(); Set> components = clusterer.apply(graph); System.out.println("Number of components: " + components.size()); System.out.println("Number of edges: " + graph.getEdgeCount()); System.out.println("Number of nodes: " + graph.getVertexCount()); System.out.println("Random jump factor: " + args.alpha); // Compute personalized PageRank. PageRankWithPriors ranker = new PageRankWithPriors<>(graph, new Function() { @Override public Double apply(String vertex) { return vertex.equals(args.source) ? 1.0 : 0; } }, args.alpha); // Note that the Altiscale cluster is still on Java 7, so we don't have lambdas. ranker.evaluate(); // Use priority queue to sort vertices by PageRank values. PriorityQueue> q = new PriorityQueue<>(); int i = 0; for (String pmid : graph.getVertices()) { q.add(new Ranking<>(i++, ranker.getVertexScore(pmid), pmid)); } // Print PageRank values. System.out.println("\nPageRank of nodes, in descending order:"); Ranking r; while ((r = q.poll()) != null) { System.out.println(String.format("%.5f\t%s", r.rankScore, r.getRanked())); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy