package com.twitter.cassovary.graph

import com.google.common.annotations.VisibleForTesting
import com.twitter.cassovary.graph.StoredGraphDir._
import com.twitter.cassovary.graph.node._
import com.twitter.cassovary.collections.CSeq
import com.twitter.cassovary.util.collections.Int2IntMap
import com.twitter.cassovary.util.{Int2ObjectMap, ArrayBackedSet, BoundedFuturePool, Sharded2dArray}
import com.twitter.finagle.stats.{Stat, DefaultStatsReceiver}
import com.twitter.logging.Logger
import com.twitter.util.{Await, Future, FuturePool}

import java.util.concurrent.atomic.AtomicInteger

import com.twitter.cassovary.collections.CSeq.Implicits._

object SharedArrayBasedDirectedGraph {
  private lazy val log = Logger.get()
  private val statsReceiver = DefaultStatsReceiver
  val emptyArray = Array.empty[Int]

  private object EdgeShards {
    private val numBits = 8
    val numShards = 1 << numBits
    private val mask = numShards - 1

    def hash(i: Int) = i & mask
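
    // Illustrative (not part of the original source): with numBits = 8 we get numShards = 256
    // and mask = 255, so hash(i) keeps only the low 8 bits of i, e.g.
    //   hash(5)   == 5
    //   hash(256) == 0
    //   hash(257) == 1
    // For non-negative ids this is the same as i % numShards, spreading consecutive ids
    // evenly across shards.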
  }

  /**
   * Construct a shared array-based graph from a sequence of NodeIdEdgesMaxId iterables.
   * E.g., each Iterable[NodeIdEdgesMaxId] could come from one graph dump file.
   *
   * This function builds the graph using similar steps as in ArrayBasedDirectedGraph.
   * The main difference is that instead of each node storing its neighbor ids in a
   * separate array, one shared array is used for all neighbor ids.
   * Thus each node can find its edges through an offset into this shared array.
   * To avoid huge arrays, this edge array is also sharded based on the node's id.
   *
   * @param iterableSeq a sequence of iterables over nodes, each node carrying its own edges
   * @param parallelismLimit the maximum number of threads used during construction
   * @param storedGraphDir the direction of the graph to be built
   * @param forceSparseRepr if Some(true), the code saves storage at the expense of speed
   *                        by using ConcurrentHashMap instead of Array. If Some(false), chooses
   *                        Array instead. If None, the code calculates whether the graph
   *                        is sparse based on the number of nodes and maximum node id.
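   *
   * A minimal usage sketch (hypothetical ids and edges; it assumes the NodeIdEdgesMaxId(id, edges)
   * factory taking an Array[Int] of neighbor ids, which is defined elsewhere in cassovary):
   * {{{
   *   val part: Iterable[NodeIdEdgesMaxId] = Seq(
   *     NodeIdEdgesMaxId(1, Array(2, 3)),
   *     NodeIdEdgesMaxId(2, Array(3))
   *   )
   *   // 2 == parallelismLimit
   *   val graph = SharedArrayBasedDirectedGraph(Seq(part), 2, StoredGraphDir.BothInOut)
   * }}}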
   */
  def apply(iterableSeq: Seq[Iterable[NodeIdEdgesMaxId]], parallelismLimit: Int,
      storedGraphDir: StoredGraphDir, forceSparseRepr: Option[Boolean] = None) = {
    val constructor = new SharedArrayBasedDirectedGraphConstructor(iterableSeq, parallelismLimit,
      storedGraphDir, forceSparseRepr)
    constructor.construct()
  }

  private class SharedArrayBasedDirectedGraphConstructor(
    iterableSeq: Seq[Iterable[NodeIdEdgesMaxId]],
    parallelismLimit: Int,
    storedGraphDir: StoredGraphDir,
    forceSparseRepr: Option[Boolean] = None) {

    private val futurePool = new BoundedFuturePool(FuturePool.unboundedPool, parallelismLimit)

    // info kept for each shard while building
    private class PerShardInfo(val shardNum: Int) {
      val numEdges = new AtomicInteger()
      val nextFreeEdgeIndex = new AtomicInteger()

      // used in reverse edges
      var numIdsMapped = 0
      var idsMapped: Array[Int] = _
    }
    private def newShardsInfo() = Array.tabulate(EdgeShards.numShards)(i => new PerShardInfo(i))
    private val shardsInfo = newShardsInfo()

    /**
     * Goes over each NodeIdEdgesMaxId in every input iterable (one future per iterable) and
     * computes the size of each shard by summing the edge lengths of all nodes that fall
     * into that shard. A node's edges fall in a shard when the node's id hashes to that
     * shard's index.
     *
     * @return one graph meta-information object per input iterable, with max id, edge count
     *         and node count filled in
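     *
     * For example (illustrative numbers only): with 256 shards, node 5 with 3 edges and
     * node 261 with 2 edges both hash to shard 5 (261 & 255 == 5), so that shard needs
     * (3 + 1) + (2 + 1) = 7 ints: each node contributes its edge count plus one slot
     * holding the count itself.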
     */
    private def readMetaInfoPerShard(): Future[Seq[SharedGraphMetaInfo]] = {
      log.debug("read out num of edges and max id from files in parallel")
      val stat = statsReceiver.stat("graph_load_read_out_edge_sizes_dump_files")
      Stat.timeFuture(stat) {
        val futures = iterableSeq map {
          edgesIterable => futurePool {
            var id, newMaxId, numOfEdges, edgesLength, numNodes, shardNum = 0
            val iteratorForEdgeSizes = edgesIterable.iterator
            iteratorForEdgeSizes foreach { item =>
              id = item.id
              newMaxId = newMaxId max item.maxId
              edgesLength = item.edges.length
              shardNum = EdgeShards.hash(id)
              // +1 because we will keep edgesLength in the very first index
              shardsInfo(shardNum).numEdges.addAndGet(edgesLength + 1)
              numOfEdges += edgesLength
              numNodes += 1
            }
            SharedGraphMetaInfo(newMaxId, numOfEdges, numNodes)
          }
        }
        Future.collect(futures)
      }
    }

    /**
     * Aggregate meta-information from parts of graph
     */
    private def aggregateMetaInfoFromParts(partsMetaInfo: Seq[SharedGraphMetaInfo]):
      SharedGraphMetaInfo = {

      def aggregate(meta1: SharedGraphMetaInfo, meta2: SharedGraphMetaInfo) = {
        SharedGraphMetaInfo(meta1.maxId max meta2.maxId,
          meta1.numEdges + meta2.numEdges, meta1.numNodes + meta2.numNodes)
      }

      partsMetaInfo.reduce(aggregate)
    }

    /**
     * Instantiates the shared edge array: one Array[Int] per shard, sized from `shardsInfo`.
     */
    private def instantiateSharedArray(shardsInfo: Array[PerShardInfo]): Array[Array[Int]] = {
      // instantiate shared array (2-dimensional)
      val sharedEdgeArray = new Array[Array[Int]](EdgeShards.numShards)
      for (i <- 0 until EdgeShards.numShards) {
        sharedEdgeArray(i) = new Array[Int](shardsInfo(i).numEdges.get)
      }
      sharedEdgeArray
    }

    private def fillEdgesMarkNodeOffsets(iterableSeq: Seq[Iterable[NodeIdEdgesMaxId]],
                                         sharedEdgeArray: Array[Array[Int]],
                                         nodeCollection: NodeCollection,
                                         shardsInfo: Array[PerShardInfo],
                                         partsMetaInfo: Seq[SharedGraphMetaInfo]): Future[Unit] = {
      log.debug("loading nodes and out edges from file in parallel and marking all nodes")
      val allNodeIdsSet = new ArrayBackedSet(nodeCollection.maxNodeId)

      val futures = iterableSeq.indices.map {
        index => futurePool {
          val edgesIterable = iterableSeq(index)
          val ids = new Array[Int](partsMetaInfo(index).numNodes)
          val offsets = new Array[Int](partsMetaInfo(index).numNodes)
          var id, edgesLength, shardIdx, offset, i = 0
          edgesIterable.foreach { item =>
            id = item.id
            edgesLength = item.edges.length
            shardIdx = EdgeShards.hash(id)
            offset = shardsInfo(shardIdx).nextFreeEdgeIndex.getAndAdd(edgesLength + 1)
            Array.copy(item.edges, 0, sharedEdgeArray(shardIdx), offset + 1, edgesLength)
            ids(i) = id
            offsets(i) = offset + 1
            i += 1
            sharedEdgeArray(shardIdx)(offset) = edgesLength
            item.edges foreach { edge => allNodeIdsSet.add(edge) }
          }
          (ids, offsets)
        }
      }

      Future.collect(futures).map { idsAndOffsetsAll =>
        // serialize addition to nodeCollection
        // don't need to keep allNodeIdsSet around, encode in offsets()
        allNodeIdsSet.foreach { i => nodeCollection.updateOffset(i, 0) }
        idsAndOffsetsAll foreach { case (ids, offsets) =>
            ids.indices foreach { i => nodeCollection.updateOffset(ids(i), offsets(i)) }
        }
      }
    }

    private def sharded2dArray(nc: NodeCollection, sharedEdgeArray: Array[Array[Int]]) = {

      def numEdges(id: Int) = {
        if (nc.emptyNode(id)) 0
        else {
          sharedEdgeArray(EdgeShards.hash(id))(nc.getEdgeOffset(id) - 1)
        }
      }

      new Sharded2dArray[Int](sharedEdgeArray, nc.validNode, nc.offsets, numEdges, EdgeShards.hash)
    }


    def construct(): SharedArrayBasedDirectedGraph = {

      val future = for {
        partsMetaInfo <- readMetaInfoPerShard()
        metaInfo = aggregateMetaInfoFromParts(partsMetaInfo)
        nodeCollection = new NodeCollection(metaInfo, forceSparseRepr)
        sharedEdgeArray = instantiateSharedArray(shardsInfo)
        _ <- fillEdgesMarkNodeOffsets(iterableSeq, sharedEdgeArray, nodeCollection,
          shardsInfo, partsMetaInfo)
        outEdges = sharded2dArray(nodeCollection, sharedEdgeArray)
        reverseDirEdgeArray <-
          if (storedGraphDir == BothInOut)
            createReverseDirEdgeArray(outEdges, nodeCollection)
          else
            Future.value(None)
      } yield new SharedArrayBasedDirectedGraph(nodeCollection, outEdges,
          reverseDirEdgeArray, metaInfo, storedGraphDir)

      val graph = Await.result(future)
      log.debug("DONE")
      graph
    }

    /**
     * Creates an array for reverse direction edges.
     *
     * Needed only if storing both directions.
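     *
     * The construction is a two-pass, counting-sort style build: first count in-degrees,
     * then reserve a slot range per node and fill it with a moving cursor, and finally sort
     * each range. A standalone sketch of the idea (illustrative only, ignoring the sharding
     * and the parallel futures used below; plain Scala collections instead of Sharded2dArray):
     * {{{
     *   val outEdges = Map(1 -> Array(2, 3), 2 -> Array(3))
     *   // pass 1: count in-degrees
     *   val inDegree = scala.collection.mutable.Map[Int, Int]().withDefaultValue(0)
     *   for ((src, nbrs) <- outEdges; dst <- nbrs) inDegree(dst) += 1
     *   // reserve one array per target node, then pass 2: fill with a per-node cursor
     *   val inEdges = inDegree.map { case (id, d) => id -> new Array[Int](d) }
     *   val cursor = scala.collection.mutable.Map[Int, Int]().withDefaultValue(0)
     *   for ((src, nbrs) <- outEdges; dst <- nbrs) {
     *     inEdges(dst)(cursor(dst)) = src
     *     cursor(dst) += 1
     *   }
     *   inEdges.values.foreach(a => java.util.Arrays.sort(a))
     *   // inEdges(2).toSeq == Seq(1); inEdges(3).toSeq == Seq(1, 2)
     * }}}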
     */
    private def createReverseDirEdgeArray(outEdges: Sharded2dArray[Int],
      nodeCollection: NodeCollection): Future[Option[Sharded2dArray[Int]]] = {

      val numNodes = nodeCollection.size
      val nodesWithInEdges = new NodeCollection(nodeCollection.graphInfo, forceSparseRepr)
      val reverseShardsInfo = newShardsInfo()

      val inEdgesSizes = Int2ObjectMap[AtomicInteger](nodeCollection.considerGraphSparse,
        Some(nodeCollection.graphInfo.numNodes), Some(nodeCollection.maxNodeId), isConcurrent = false)

      def partitionNodeIdsPerShard() = {
        nodeCollection foreach { id =>
          shardsInfo(EdgeShards.hash(id)).numIdsMapped += 1
          inEdgesSizes.update(id, new AtomicInteger())
        }
        shardsInfo foreach { shardInfo =>
          shardInfo.idsMapped = new Array[Int](shardInfo.numIdsMapped)
          shardInfo.numIdsMapped = 0 // reusing this counter as a write index into idsMapped below
        }
        // fill out the array per shard by binning nodeCollection's ids into each shard
        nodeCollection foreach { id =>
          val shard = shardsInfo(EdgeShards.hash(id))
          shard.idsMapped(shard.numIdsMapped) = id
          shard.numIdsMapped += 1
        }
      }

      // do function f(shardNum, id, idIndex) for all nodes, divided into EdgeShards.numShards futures
      def doForAllNodeIdsDetail(f: (Int, Int, Int) => Unit): Future[Unit] = {
        val futures = shardsInfo map { oneShardInfo =>
          futurePool {
            for (idIndex <- 0 until oneShardInfo.numIdsMapped)
              f(oneShardInfo.shardNum, oneShardInfo.idsMapped(idIndex), idIndex)
          }
        }
        Future.join(futures)
      }

      def doForAllNodeIds(f: Int => Unit): Future[Unit] = {
        doForAllNodeIdsDetail { (_, id, _) => f(id) }
      }

      def findInEdgesSizes() = {
        log.debug("calculating incoming neighbor sizes for each node")
        doForAllNodeIds { id =>
          outEdges.foreach(id) { neighbor =>
            inEdgesSizes(neighbor).incrementAndGet()
          }
        }
      }

      def findInShardSizes(): Future[Unit] = {
        val offsetsAll = shardsInfo map { oneShardInfo => new Array[Int](oneShardInfo.numIdsMapped) }
        doForAllNodeIdsDetail { (shardNum, id, idIndex) =>
          val len = inEdgesSizes(id).get
          val offset = if (len > 0) {
            reverseShardsInfo(shardNum).numEdges.getAndAdd(1 + len) + 1
          }
          else 0
          offsetsAll(shardNum)(idIndex) = offset
        } map { _ =>
          // update offsets serially, after all per-shard futures complete
          shardsInfo foreach { oneShardInfo =>
            val offsetsThisShard = offsetsAll(oneShardInfo.shardNum)
            for (index <- 0 until oneShardInfo.idsMapped.length) {
              nodesWithInEdges.updateOffset(oneShardInfo.idsMapped(index), offsetsThisShard(index))
            }
          }
        }
      }

      def fillInEdgesOffsets(sharedInEdgesArray: Array[Array[Int]]): Future[Unit] = {
        log.debug("filling lengths in 2d array")
        Stat.timeFuture(statsReceiver.stat("graph_load_fill_in_edge_lengths_and_offsets")) {
          doForAllNodeIds { id =>
            val len = inEdgesSizes(id).get
            if (len > 0) {
              val shard = EdgeShards.hash(id)
              val off = nodesWithInEdges.offsets(id)
              sharedInEdgesArray(shard)(off - 1) = len
              inEdgesSizes(id).set(off) // we will start storing actual neighbors starting here
            }
          }
        }
      }

      def fillInEdges(sharedInEdgesArray: Array[Array[Int]]): Future[Unit] = {
        log.debug("filling in edges")
        Stat.timeFuture(statsReceiver.stat("graph_load_fill_in_edges")) {
          doForAllNodeIds { nodeId =>
            outEdges.foreach(nodeId) { neighborId =>
              val shard = sharedInEdgesArray(EdgeShards.hash(neighborId))
              // remember that inEdgesSizes was re-used to hold the next write offset for this neighborId
              shard(inEdgesSizes(neighborId).getAndIncrement) = nodeId
            }
          }
        }
      }

      def sortInEdges(sharedInEdgesArray: Array[Array[Int]]): Future[Unit] = {
        log.debug("sorting in edges in place")
        doForAllNodeIds { nodeId =>
          val offset = nodesWithInEdges.getEdgeOffset(nodeId)
          if (offset > 0) {
            val shardNum = EdgeShards.hash(nodeId)
            val numEdges = sharedInEdgesArray(shardNum)(offset - 1)
            java.util.Arrays.sort(sharedInEdgesArray(shardNum), offset, offset + numEdges)
          }
        }
      }

      // main set of steps to build incoming edges in the graph
      log.info("Now building the reverse direction representation")
      partitionNodeIdsPerShard()
      for {
        _ <- findInEdgesSizes()
        _ <- findInShardSizes()
        sharedInEdges = instantiateSharedArray(reverseShardsInfo)
        _ <- fillInEdgesOffsets(sharedInEdges)
        _ <- fillInEdges(sharedInEdges)
        _ <- sortInEdges(sharedInEdges)
      } yield Some(sharded2dArray(nodesWithInEdges, sharedInEdges))
    }
  }

  @VisibleForTesting
  def apply(iterable: Iterable[NodeIdEdgesMaxId], storedGraphDir: StoredGraphDir):
    SharedArrayBasedDirectedGraph = apply(Seq(iterable), 1, storedGraphDir)
}

private class NodeCollection(val graphInfo: SharedGraphMetaInfo, forceSparsity: Option[Boolean] = None)
    extends Iterable[Int] {

  val maxNodeId = graphInfo.maxId
  val considerGraphSparse: Boolean = forceSparsity getOrElse {
    // sparse if number of nodes is much less than maxNodeId, AND
    // number of edges is also less than maxNodeId. If number of edges
    // were similar to or greater than maxNodeId, then the extra overhead of allocating
    // an array of size maxNodeId is not too much relative to the storage occupied by
    // the edges themselves
    (graphInfo.numNodes * 8 < maxNodeId) && (graphInfo.numEdges < 4 * maxNodeId)
  }
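  // Worked example of the heuristic above (illustrative numbers only, not from the original
  // source): with numNodes = 1e6, numEdges = 1e7 and maxNodeId = 1e8, both 8 * 1e6 < 1e8 and
  // 1e7 < 4 * 1e8 hold, so the sparse (hash-based) map is chosen; a dense array of 1e8 + 1
  // ints (~400 MB) would dwarf the ~40 MB occupied by the edges themselves.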

  /**
   * Encoding of a node with id = i using the single int map `edgeOffsets`:
   * 1. i exists in the graph and has edges: edgeOffsets(i) > 0, length(i) = arr(edgeOffsets(i) - 1),
   *    and the edges of i are in arr[edgeOffsets(i)..edgeOffsets(i) + length(i) - 1],
   *    where arr is the edge shard that i hashes to.
   * 2. i exists in the graph but has no edges: edgeOffsets(i) == 0
   * (Obsoleted when we are not using a full array: 3. i does not exist in the graph. edgeOffsets(i) == -1)
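   *
   * A minimal read-side sketch (illustrative only, not part of the original source) of how a
   * node's edges are decoded from this encoding, given `shard`, the Array[Int] edge shard that
   * node i hashes to:
   * {{{
   *   def edgesOf(i: Int, shard: Array[Int]): Seq[Int] = {
   *     val off = edgeOffsets(i)
   *     if (off == 0) Seq.empty        // valid node, but no edges in this direction
   *     else {
   *       val len = shard(off - 1)     // the length is stored just before the edges
   *       shard.slice(off, off + len).toSeq
   *     }
   *   }
   * }}}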
   */
  //val edgeOffsets = Array.fill[Int](maxNodeId + 1)(-1)
  private val edgeOffsets = Int2IntMap(isSparse = considerGraphSparse, numKeysEstimate = None, maxId = Some(maxNodeId))

  def offsets: Int => Int = edgeOffsets

  def updateOffset(i: Int, v: Int) { edgeOffsets(i) = v }
  def getEdgeOffset(id: Int) = edgeOffsets(id)

  def validNode(id: Int) = edgeOffsets.contains(id)
  def hasEdges(id: Int) = validNode(id) && edgeOffsets(id) > 0
  def emptyNode(id: Int) = validNode(id) && edgeOffsets(id) == 0

  // iterator of valid node ids
  def iterator = edgeOffsets.keysIterator

  override lazy val size = edgeOffsets.size
}


/**
 * Contains meta-information about a particular graph instance.
 *
 * @param maxId the max node id in the graph
 * @param numEdges the number of edges in the graph
 * @param numNodes the number of nodes in the graph
 */
case class SharedGraphMetaInfo(maxId: Int, numEdges: Long, numNodes: Int)

/**
 * This class is an implementation of the directed graph trait that is backed
 * by a sharded 2-dimensional edge array. Each node's edges are stored
 * consecutively in one shard of the edge array. The number of shards is usually much
 * smaller than the number of nodes.
 *
 * @param nodeCollection the collection of node ids together with their edge offsets
 * @param edges the 2-dimensional sharded out-edge array
 * @param reverseDirEdges the optional sharded array of incoming edges, present only when
 *                        both directions are stored
 * @param metaInformation graph meta-information
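 *
 * A minimal usage sketch (illustrative; `outboundNodes()` is assumed from the Node trait,
 * which is defined outside this file):
 * {{{
 *   graph.getNodeById(1) match {
 *     case Some(node) => println(node.outboundNodes())
 *     case None       => println("node 1 is not in the graph")
 *   }
 *   println(s"nodes = ${graph.nodeCount}, directed edges = ${graph.edgeCount}")
 * }}}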
 */
class SharedArrayBasedDirectedGraph private (
  nodeCollection: NodeCollection,
  edges: Sharded2dArray[Int],
  reverseDirEdges: Option[Sharded2dArray[Int]],
  metaInformation: SharedGraphMetaInfo,
  val storedGraphDir: StoredGraphDir
) extends DirectedGraph[Node] {

  lazy val nodeCount: Int = nodeCollection.size

  lazy val edgeCount: Long = if (isBiDirectional) 2 * metaInformation.numEdges else metaInformation.numEdges

  override lazy val maxNodeId = nodeCollection.maxNodeId

  private def newNode(id: Int) = SharedArrayBasedDirectedNode(id,
          edges, storedGraphDir, reverseDirEdges)

  def iterator = nodeCollection.iterator.map(newNode)

  def getNodeById(id: Int): Option[Node] = {
    if ((id < 0) || (id > maxNodeId) || !nodeCollection.validNode(id)) {
      None
    } else {
      Some(newNode(id))
    }
  }
}



