
// com.twitter.cassovary.graph.GraphUtils.scala (Maven / Gradle / Ivy)
// The newest version!
/*
* Copyright 2014 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package com.twitter.cassovary.graph
import com.twitter.cassovary.graph.GraphDir._
import com.twitter.cassovary.graph.tourist._
import com.twitter.logging.Logger
import com.twitter.ostrich.stats.Stats
import it.unimi.dsi.fastutil.ints.{Int2IntMap, Int2ObjectMap}
import it.unimi.dsi.fastutil.objects.Object2IntMap
import scala.util.Random
/**
* This class contains some common graph utilities and convenience functions.
*/
class GraphUtils(val graph: Graph) {
  import GraphUtils._

  private val log = Logger.get

  /**
   * Do a walk on the {@code nodes} in a graph by applying {@code walkFunc} to each of them
   * in iteration order. The iterator is consumed by this call.
   * @param nodes the nodes to visit in this walk
   * @param walkFunc callback invoked once for every visited node; callers typically use it
   *                 to update per-walk state (e.g. a visit counter)
   */
  def walk(nodes: Iterator[Node], walkFunc: (Node => Unit)): Unit = {
    nodes.foreach(walkFunc)
  }

  /*
   * The following are different types of graph traversals (aka "walks").
   */

  /**
   * This is a breadth-first walk along the direction specified by {@code dir}.
   * If {@code startNodeId} does not exist in the graph no traversal is performed and the two
   * freshly created (unvisited) counters are returned.
   * @param dir the direction in which edges are followed
   * @param startNodeId node to start the breadth-first walk from
   * @param walkParams the parameters specifying this walk
   * @return a tuple of two elements
   * The first is a counter tracking a visited node's id V and the number of visits to that node.
   * The second is a counter tracking a visited node's id V and a set of neighbors. The neighbors
   * are sorted in decreasing order by occurrence
   */
  def bfsWalk(dir: GraphDir, startNodeId: Int, walkParams: RandomWalkParams)():
      (VisitsCounter, PrevNbrCounter) = {
    val visitsCounter = new VisitsCounter
    val prevNbrCounter = new PrevNbrCounter(walkParams.numTopPathsPerNode,
      walkParams.visitSameNodeOnce)

    if (graph.existsNodeId(startNodeId)) {
      // The traverser is additionally capped by BoundedIterator at walkParams.numSteps.
      val traversedNodes = new BreadthFirstTraverser(graph, dir, Seq(startNodeId),
        walkParams.maxDepth, walkParams.maxNumEdgesThresh, walkParams.numSteps,
        walkParams.visitSameNodeOnce, Some(prevNbrCounter))
        with BoundedIterator[Node] {
        val maxSteps = walkParams.numSteps
      }

      Stats.incr("bfs_walk_request", 1)
      Stats.time("bfs_walk_traverse") {
        walk(traversedNodes, { node =>
          // prevNbrCounter is mutated within BreadthFirstTraverser
          visitsCounter.visit(node)
        })
      }
    }

    (visitsCounter, prevNbrCounter)
  }

  /**
   * Do a random walk starting from the set of nodes with ids {@code startNodeIds}.
   * The walk maintains a count of the number of times that nodes have been visited
   * during the walk. The walk is only performed when {@code startNodeIds} is non-empty and
   * every id in it exists in the graph; otherwise the counters are returned unvisited.
   * @param dir the direction in which edges are followed
   * @param startNodeIds nodes to start the random walk from
   * @param walkParams the {@link RandomWalkParams} random walk parameters
   * @return a tuple of two elements.
   * The first is a counter tracking a visited node's id to the number of visits to that node.
   * The second is a counter tracking a visited node's id to the paths visited while hitting
   * that node; it is None unless {@code walkParams.numTopPathsPerNode} holds a positive value.
   * The paths are sorted in decreasing order by occurrence
   * Each path is kept as a {@link DirectedPath}.
   */
  def randomWalk(dir: GraphDir, startNodeIds: Seq[Int], walkParams: RandomWalkParams)():
      (VisitsCounter, Option[PathsCounter]) = {
    val startNodesExist =
      startNodeIds.nonEmpty && startNodeIds.forall(id => graph.existsNodeId(id))

    val visitsCounter = new VisitsCounter
    // Paths are only tracked when a positive number of top paths was requested.
    val pathsCounterOption = walkParams.numTopPathsPerNode
      .filter(_ > 0)
      .map(k => new PathsCounter(k, startNodeIds))

    if (startNodesExist) {
      val traversedNodes = new RandomBoundedTraverser(graph, dir, startNodeIds,
        walkParams.numSteps, walkParams)

      Stats.time("random_walk_traverse") {
        walk(traversedNodes, { node =>
          visitsCounter.visit(node)
          pathsCounterOption.foreach(_.visit(node))
        })
      }
    }

    (visitsCounter, pathsCounterOption)
  }

  /**
   * Calculates the reputation of graph nodes personalized to a given set of nodes based on a
   * random walk in direction {@code walkParams.dir}.
   * @param startNodeIds the ids of the nodes to get personalized reputations for
   * @param walkParams the {@link RandomWalkParams} random walk parameters
   * @return a 2-tuple:
   * 1. A mapping from a visited node's id to the number of visits made to that node, as
   *    reported by the visits counter, and
   * 2. If paths were tracked, a mapping for a visited node with id V to the top paths leading
   *    to V in the form of (P as a {@link DirectedPath}, frequency of walking P); None otherwise.
   */
  def calculatePersonalizedReputation(startNodeIds: Seq[Int], walkParams: RandomWalkParams):
      (Int2IntMap, Option[Int2ObjectMap[Object2IntMap[DirectedPath]]]) = {
    Stats.time("PTC_total") {
      // The empty parameter list is applied explicitly instead of relying on auto-application.
      val (visitsCounter, pathsCounterOption) =
        randomWalk(walkParams.dir, startNodeIds, walkParams)()
      val topPathsOption = pathsCounterOption.map(_.infoAllNodes)
      (visitsCounter.infoAllNodes, topPathsOption)
    }
  }

  /**
   * Single-start-node convenience overload of {@code calculatePersonalizedReputation}.
   * @param startNodeId the id of the node to get personalized reputations for
   * @param walkParams the {@link RandomWalkParams} random walk parameters
   * @return same 2-tuple as the overload taking a sequence of start node ids
   */
  def calculatePersonalizedReputation(startNodeId: Int, walkParams: RandomWalkParams):
      (Int2IntMap, Option[Int2ObjectMap[Object2IntMap[DirectedPath]]]) = {
    calculatePersonalizedReputation(Seq(startNodeId), walkParams)
  }

  /**
   * Does a breadth-first-walk starting from {@code startNodeId} in direction
   * {@code walkParams.dir} using the walk parameters specified in {@code walkParams}.
   * @param startNodeId the id of the node to start the walk from
   * @param walkParams the {@link RandomWalkParams} walk parameters
   * @return a 2-tuple:
   * 1. A mapping from a visited node's id to the number of visits made to that node, and
   * 2. A mapping for a visited node with id V to the information kept by the previous-neighbor
   *    counter for V (presumably previous neighbor id to number of occurrences; confirm
   *    against {@code PrevNbrCounter}).
   */
  def calculateBFS(startNodeId: Int, walkParams: RandomWalkParams):
      (Int2IntMap, Int2ObjectMap[Int2IntMap]) = {
    Stats.time("BFS_total") {
      // The empty parameter list is applied explicitly instead of relying on auto-application.
      val (visitsCounter, prevNbrCounter) = bfsWalk(walkParams.dir, startNodeId, walkParams)()
      (visitsCounter.infoAllNodes, prevNbrCounter.infoAllNodes)
    }
  }

  /**
   * @param id the id of the node to count neighbors of
   * @param dir the direction of interest
   * @return number of neighbors (i.e., number of nodes 1 hop away) in {@code dir}.
   * Warning: Lossy: Returns 0 if id is not found.
   */
  def neighborCount(id: Int, dir: GraphDir): Int =
    funcById(id, dir, (nd: Node, d: GraphDir) => nd.neighborCount(d), 0)

  // helper to be used in functions that take nodeId and need to log warnings
  // in the case that the id is not found
  // NOTE(review): not called by any method of this class at present.
  private def onNodeIdNotFound(desc: String, id: Int): Unit = {
    log.warning("(%s) Node with id = %d not found!", desc, id)
  }

  // convenience wrapper for methods that run on node ids; yields None if the id is not found
  private def funcById[T](id: Int, dir: GraphDir, f: (Node, GraphDir) => T): Option[T] = {
    graph.getNodeById(id).map(node => f(node, dir))
  }

  // convenience wrapper for methods that provide a default value if node is not found
  private def funcById[T](id: Int, dir: GraphDir, f: (Node, GraphDir) => T, defaultVal: T): T = {
    funcById(id, dir, f).getOrElse(defaultVal)
  }
}
object GraphUtils {

  /**
   * Parameters of a walk (such as a random walk, or a breadth first walk).
   * Construction fails with an IllegalArgumentException (via {@code require}) when
   * {@code resetProbability} lies outside of [0.0, 1.0].
   * @param numSteps number of steps to take in the walk
   * @param resetProbability the probability with which to reset back to {@code startNodeId}.
   *   Must lie between 0.0 and 1.0, both inclusive. Ignored for non-random walks
   * @param maxNumEdgesThresh Max number of edges allowed for a node
   *   beyond which the next random step is {@code startNodeId} regardless of anything else
   * @param numTopPathsPerNode the number of top paths to {@code node} to maintain, None if we
   *   don't want to maintain them at all
   * @param maxDepth the maximum depth to visit in depth-related search (e.g. startNodeId has
   *   depth 0, its immediate neighbors have depth 1, etc.). None presumably means that no
   *   depth limit is applied (confirm against the traverser using it)
   * @param visitSameNodeOnce if true, the walk would only visit the same node once
   *   and will likely restart to a start node when visiting the same node twice
   * @param dir traverse out-direction or in-direction
   * @param stable if true, use a fixed random number generator in the random walk
   * @param filterHomeNodeByNumEdges if true, home node will be checked for maxNumEdgesThresh
   *   when visited during random walk
   */
  case class RandomWalkParams(
      numSteps: Long,
      resetProbability: Double,
      maxNumEdgesThresh: Option[Int] = None,
      numTopPathsPerNode: Option[Int] = None,
      maxDepth: Option[Int] = None,
      visitSameNodeOnce: Boolean = false,
      dir: GraphDir = GraphDir.OutDir,
      stable: Boolean = false,
      filterHomeNodeByNumEdges: Boolean = false) {

    /**
     * The pruning function that guides the pruning of a random walk. If the random walk
     * reached a node, it resets to the starting node if pruneFn returns true for that node.
     * Can be overridden by subclasses.
     * NOTE(review): the edge-count check is built for OutDir regardless of {@code dir};
     * confirm that this is intended for in-direction walks.
     */
    val pruneFn: Node => Boolean = NodeUtils.hasTooManyEdges(OutDir, maxNumEdgesThresh)

    // Seed of the generator handed out when a stable (repeatable) walk is requested.
    private val seed = 268430371266199L

    /** Random number generator of the walk: seeded with a fixed value iff {@code stable}. */
    lazy val randNumGen: Random = {
      if (stable) new Random(seed)
      else new Random
    }

    require(0.0 <= resetProbability && resetProbability <= 1.0,
      "reset probability must be between 0.0 and 1.0")
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy