// com.bigdata.rdf.graph.analytics.PR Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC 2006-2012. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.bigdata.rdf.graph.analytics;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.log4j.Logger;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import com.bigdata.rdf.graph.BinderBase;
import com.bigdata.rdf.graph.EdgesEnum;
import com.bigdata.rdf.graph.Factory;
import com.bigdata.rdf.graph.FrontierEnum;
import com.bigdata.rdf.graph.IBinder;
import com.bigdata.rdf.graph.IBindingExtractor;
import com.bigdata.rdf.graph.IGASContext;
import com.bigdata.rdf.graph.IGASScheduler;
import com.bigdata.rdf.graph.IGASState;
import com.bigdata.rdf.graph.IReducer;
import com.bigdata.rdf.graph.impl.BaseGASProgram;
/**
* Page rank assigns weights to the vertices in a graph based on their relative
* "importance" as determined by the patterns of directed links in the graph.
* The algorithm is stated in terms of a computation that is iterated until
* the delta in the computed values for the vertices is within <i>epsilon</i>
* of ZERO. However, in practice convergence based on <i>epsilon</i> is
* problematic due to the manner in which the results of the floating point
* operations depend on the sequence of those operations (which is why this
* implementation uses <code>double</code> precision). Thus, page rank is
* typically executed a specific number of iterations, e.g., 50 or 100. If
* convergence is based on <i>epsilon</i>, then it is possible that the
* computation will never converge, especially for smaller values of
* <i>epsilon</i>.
*
* <dl>
* <dt>init</dt>
* <dd>All vertices are inserted into the initial frontier.</dd>
* <dt>Gather</dt>
* <dd>sum( neighbor_value / neighbor_num_out_edges ) over the in-edges of the
* graph.</dd>
* <dt>Apply</dt>
* <dd>value = <i>resetProb</i> + (1.0 - <i>resetProb</i>) * gatherSum</dd>
* <dt>Scatter</dt>
* <dd>if (a) value has significantly changed <code>(fabs(old-new) GT
* <i>epsilon</i>)</code>; or (b) iterations LT limit</dd>
* </dl>
* <ul>
* <li>where <i>resetProb</i> is a value that determines a random reset
* probability and defaults to {@value PR#DEFAULT_RESET_PROB}.</li>
* <li>where <i>epsilon</i> controls the degree of convergence before the
* algorithm terminates and defaults to {@value PR#DEFAULT_EPSILON}.</li>
* </ul>
*
*
* FIXME PR UNIT TEST. Verify computed values (within variance) and max
* iterations. Ground truth from GL?
*
* FIXME PR: The out-edges can be taken directly from the vertex distribution.
* That will reduce the initialization overhead for PR.
*
* @author Bryan Thompson
*/
public class PR extends BaseGASProgram {
private static final Logger log = Logger.getLogger(PR.class);
// TOOD javadoc and config.
protected static final int DEFAULT_LIMIT = 100;
protected static final double DEFAULT_RESET_PROB = 0.15d;
protected static final double DEFAULT_EPSILON = 0.01d;
protected static final double DEFAULT_MIN_PAGE_RANK = 1d;
private final double resetProb = DEFAULT_RESET_PROB;
private final double epsilon = DEFAULT_EPSILON;
private final int limit = DEFAULT_LIMIT;
private final double minPageRank = DEFAULT_MIN_PAGE_RANK;
public static class VS {
/**
* The current computed value for this vertex.
*
* All vertices are initialized to the reset probability. They are then
* updated in each iteration to the new estimated value by apply().
*/
private double value;
/**
* The number of out-edges. This is computed once, when the vertex state
* is initialized.
*/
private long outEdges;
/**
* The last delta observed for this vertex.
*/
private double lastChange = 0d;
/**
* The current computed value for this vertex.
*
* All vertices are initialized to the reset probability. They are then
* updated in each iteration to the new estimated value by apply().
*/
public double getValue() {
synchronized (this) {
return value;
}
}
@Override
public String toString() {
return "{value=" + value + ",lastChange=" + lastChange + "}";
}
}// class VS
/**
* Edge state is not used.
*/
public static class ES {
}
private static final Factory vertexStateFactory = new Factory() {
@Override
public PR.VS initialValue(final Value value) {
return new VS();
}
};
@Override
public Factory getVertexStateFactory() {
return vertexStateFactory;
}
@Override
public Factory getEdgeStateFactory() {
return null;
}
@Override
public FrontierEnum getInitialFrontierEnum() {
return FrontierEnum.AllVertices;
}
@Override
public EdgesEnum getGatherEdges() {
return EdgesEnum.InEdges;
}
@Override
public EdgesEnum getScatterEdges() {
return EdgesEnum.OutEdges;
}
/**
* {@inheritDoc}
*
* Each vertex is initialized to the reset probability.
*
* FIXME We need to do this efficiently. E.g., using a scan to find all of
* the vertices together with their in-degree or out-degree. That should be
* done to populate the frontier, initializing the #of out-edges at the same
* time.
*/
@Override
public void initVertex(final IGASContext ctx,
final IGASState state, final Value u) {
final PR.VS us = state.getState(u);
synchronized (us) {
us.value = resetProb;
us.outEdges = ctx.getGraphAccessor().getEdgeCount(ctx, u,
EdgesEnum.OutEdges);
}
}
/**
* {@inheritDoc}
*
*/
@Override
public Double gather(final IGASState state,
final Value u, final Statement e) {
final Value v = state.getOtherVertex(u, e);
final PR.VS vs = state.getState(v);
/*
* Note: Division by zero should not be possible here since the edge
* that we used to discover [v] is an out-edge of [v].
*/
synchronized (vs) {
return (vs.value / vs.outEdges);
}
}
/**
* SUM
*
* {@inheritDoc}
*/
@Override
public Double sum(final IGASState state,
final Double left, final Double right) {
return left + right;
}
/**
* {@inheritDoc}
*
* Compute the new value for this vertex, making a note of the last change
* for this vertex.
*/
@Override
public PR.VS apply(final IGASState state,
final Value u, final Double sum) {
final PR.VS us = state.getState(u);
if (sum == null) {
/*
* No in-edges visited by Gather. No change. Vertex will be dropped
* from the frontier.
*/
synchronized (us) {
us.lastChange = 0d;
}
return null;
}
final double newval = resetProb + (1.0 - resetProb) * sum;
synchronized (us) {
us.lastChange = (newval - us.value);
us.value = newval;
}
return us;
}
/**
* {@inheritDoc}
*
* Returns true
iff the last change was greater then epsilon.
*/
@Override
public boolean isChanged(final IGASState state,
final Value u) {
final PR.VS us = state.getState(u);
return us.lastChange > epsilon;
}
/**
* The remote vertex is scheduled for activation unless it has already been
* visited.
*
* Note: We are scattering to out-edges. Therefore, this vertex is
* {@link Statement#getSubject()}. The remote vertex is
* {@link Statement#getObject()}.
*/
@Override
public void scatter(final IGASState state,
final IGASScheduler sch, final Value u, final Statement e) {
final Value v = state.getOtherVertex(u, e);
sch.schedule(v);
}
/**
* {@inheritDoc}
*
* Continue unless the iteration limit has been reached.
*/
@Override
public boolean nextRound(final IGASContext ctx) {
return ctx.getGASState().round() < limit;
}
/**
* {@inheritDoc}
*
*
* - {@value Bindings#RANK}
* - The page rank associated with the vertex..
*
*/
@Override
public List> getBinderList() {
final List> tmp = super.getBinderList();
tmp.add(new BinderBase() {
@Override
public int getIndex() {
return Bindings.RANK;
}
@Override
public Value bind(final ValueFactory vf,
final IGASState state, final Value u) {
return vf.createLiteral(state.getState(u).getValue());
}
});
return tmp;
}
/**
* Additional {@link IBindingExtractor.IBinder}s exposed by {@link PR}.
*
* @author Bryan Thompson
*/
public interface Bindings extends BaseGASProgram.Bindings {
/**
* The computed page rank for the vertex.
*/
int RANK = 1;
}
/**
* Class reports a map containing the page rank associated with each visited
* vertex.
*
* @author Bryan
* Thompson
*/
public class PageRankReducer implements IReducer> {
private final ConcurrentHashMap values = new ConcurrentHashMap();
@Override
public void visit(final IGASState state,
final Value u) {
final VS us = state.getState(u);
if (us != null) {
final double pageRank = us.getValue();
// FIXME Why are NaNs showing up?
if (Double.isNaN(pageRank))
return;
// FIXME Do infinite values show up?
if (Double.isInfinite(pageRank))
return;
if (pageRank < minPageRank) {
// Ignore small values.
return;
}
/*
* Only report the larger ranked values.
*/
if (log.isDebugEnabled())
log.debug("v=" + u + ", pageRank=" + pageRank);
values.put(u, Double.valueOf(pageRank));
}
}
@Override
public Map get() {
return Collections.unmodifiableMap(values);
}
}
// @Override
// public void after(final IGASContext ctx) {
//
// final Map values = ctx.getGASState().reduce(
// new PageRankReducer());
//
// class NV implements Comparable {
// public final double n;
// public final Value v;
// public NV(double n, Value v) {
// this.n = n;
// this.v = v;
// }
// @Override
// public int compareTo(final NV o) {
// if (o.n > this.n)
// return 1;
// if (o.n < this.n)
// return -1;
// return 0;
// }
// }
//
// final NV[] a = new NV[values.size()];
//
// int i = 0;
//
// for (Map.Entry e : values.entrySet()) {
//
// a[i++] = new NV(e.getValue().doubleValue(), e.getKey());
//
// }
//
// Arrays.sort(a);
//
// System.out.println("rank, pageRank, vertex");
// i = 0;
// for (NV t : a) {
//
// System.out.println(i + ", " + t.n + ", " + t.v);
//
// i++;
//
// }
//
// }
}