All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.cassovary.graph.ArrayBasedDirectedGraph.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2014 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package com.twitter.cassovary.graph

import com.google.common.annotations.VisibleForTesting
import com.google.common.util.concurrent.MoreExecutors
import com.twitter.cassovary.graph.node._
import com.twitter.cassovary.graph.StoredGraphDir._
import com.twitter.cassovary.util.ExecutorUtils
import com.twitter.logging.Logger
import com.twitter.ostrich.stats.Stats
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.{ExecutorService, Future}
import scala.collection.mutable

/**
 * This case class holds a node's id, all its out edges, and the max
 * id of itself and ids of nodes in its out edges
 */
case class NodeIdEdgesMaxId(var id: Int, var edges: Array[Int], var maxId: Int)
object NodeIdEdgesMaxId {
  def apply(id: Int, edges: Array[Int]) =
      new NodeIdEdgesMaxId(id, edges, edges.foldLeft[Int](id)((x, y) => x max y))
}

/**
 * This case class holds a part of the total graph loaded in one thread
 * it consists of a seq of nodes with out-edges, a max overall id, and
 * a max id of nodes with out-edges
 */
private case class NodesMaxIds(nodesInOneThread: Seq[Node],
    maxIdInPart: Int, nodeWithOutEdgesMaxIdInPart: Int)

/**
 * provides methods for constructing an array based graph
 */
object ArrayBasedDirectedGraph {
  private lazy val log = Logger.get

  /**
   * Construct an array-based graph from an sequence of iterators over NodeIdEdgesMaxId
   * This function builds the array-based graph from a seq of nodes with out edges
   * using the following steps:
   * 0. read from file and construct a sequence of Nodes
   * 1. create an array of size maxNodeid
   * 2. mark all positions in the array where there is a node
   * 3. instantiate nodes that only have in-edges (thus has not been created in the input)
   * next steps apply only if in-edge is needed
   * 4. calculate in-edge array sizes
   * 5. (if in-edge dir only) remove out edges
   * 6. instantiate in-edge arrays
   * 7. iterate over the sequence of nodes again, instantiate in-edges
   */
  def apply(iteratorSeq: Seq[ () => Iterator[NodeIdEdgesMaxId] ], executorService: ExecutorService,
      storedGraphDir: StoredGraphDir) = {

    val nodesOutEdges = new mutable.ArrayBuffer[Seq[Node]]
    var maxNodeId = 0
    var nodeWithOutEdgesMaxId = 0
    var numEdges = 0L
    var numNodes = 0

    log.debug("loading nodes and out edges from file in parallel")
    val futures = Stats.time("graph_dump_load_partial_nodes_and_out_edges_parallel") {
      def readOutEdges(iteratorFunc: () => Iterator[NodeIdEdgesMaxId]) =
          Stats.time("graph_load_read_out_edge_from_dump_files") {
        val nodes = new mutable.ArrayBuffer[Node]
        var newMaxId = 0
        var varNodeWithOutEdgesMaxId = 0
        var id = 0
        var edgesLength = 0
        var edges: Array[Int] = Array.empty[Int]

        val iterator = iteratorFunc()
        iterator foreach { item =>
          id = item.id
          newMaxId = newMaxId max item.maxId
          varNodeWithOutEdgesMaxId = varNodeWithOutEdgesMaxId max item.id
          val edges = item.edges
          edgesLength = edges.length
          val newNode = ArrayBasedDirectedNode(id, edges, storedGraphDir)
          nodes += newNode
        }
        NodesMaxIds(nodes, newMaxId, varNodeWithOutEdgesMaxId)
      }

      ExecutorUtils.parallelWork[ () => Iterator[NodeIdEdgesMaxId], NodesMaxIds](executorService,
          iteratorSeq, readOutEdges)
    }

    futures.toArray map { future =>
      val f = future.asInstanceOf[Future[NodesMaxIds]]
      val NodesMaxIds(nodesInOneThread, maxIdInPart, nodeWithOutEdgesMaxIdInPart) = f.get
      nodesOutEdges += nodesInOneThread
      maxNodeId = maxNodeId max maxIdInPart
      nodeWithOutEdgesMaxId = nodeWithOutEdgesMaxId max nodeWithOutEdgesMaxIdInPart
    }

    val nodeIdSet = new Array[Byte](maxNodeId + 1)
    val table = new Array[Node](maxNodeId + 1)

    log.debug("mark the ids of all stored nodes in nodeIdSet")
    Stats.time("graph_load_mark_ids_of_stored_nodes") {
      def markAllNodes = {
        (nodes: Seq[Node]) => {
          nodes foreach { node =>
            val nodeId = node.id
            assert(table(nodeId) == null, "Duplicate node detected. (" + nodeId + ")")
            table(nodeId) = node
            nodeIdSet(nodeId) = 1
            storedGraphDir match {
              case StoredGraphDir.OnlyIn =>
                node.inboundNodes foreach { inEdge => nodeIdSet(inEdge) = 1 }
              case _ =>
                node.outboundNodes foreach { outEdge => nodeIdSet(outEdge) = 1 }
            }
          }
        }
      }
      ExecutorUtils.parallelForEach[Seq[Node], Unit](executorService, nodesOutEdges, markAllNodes)
    }

    // creating nodes that have only in edges but no out edges
    // also calculates the total number of edges
    val nodesWithNoOutEdges = new mutable.ArrayBuffer[Node]
    var nodeWithOutEdgesCount = 0
    log.debug("creating nodes that have only in-coming edges")
    Stats.time("graph_load_creating_nodes_without_out_edges") {
      for ( id <- 0 to maxNodeId ) {
        if (nodeIdSet(id) == 1) {
          numNodes += 1
          if (table(id) == null) {
            val node = ArrayBasedDirectedNode(id, ArrayBasedDirectedNode.noNodes, storedGraphDir)
            table(id) = node
            if (storedGraphDir == StoredGraphDir.BothInOut)
              nodesWithNoOutEdges += node
          } else {
            nodeWithOutEdgesCount += 1
            storedGraphDir match {
              case StoredGraphDir.OnlyIn =>
                numEdges += table(id).inboundNodes.size
              case _ =>
                numEdges += table(id).outboundNodes.size
            }
          }
        }
      }
    }

    // the work below is needed for BothInOut directions only
    if (storedGraphDir == StoredGraphDir.BothInOut) {
      val allNodes = new mutable.ArrayBuffer[Seq[Node]]
      allNodes ++= nodesOutEdges
      allNodes += nodesWithNoOutEdges

      log.debug("calculating in edges sizes")
      val inEdgesSizes = findInEdgesSizes()

      def populateInEdges() {
        def readInEdges = Stats.time("graph_load_read_in_edge_from_dump_files") {
          (nodes: Seq[Node]) => {
            nodes foreach { node =>
              node.outboundNodes foreach { outEdge =>
                val index = inEdgesSizes(outEdge).getAndIncrement()
                table(outEdge).asInstanceOf[BiDirectionalNode].inEdges(index) = node.id
              }
            }
          }
        }
        ExecutorUtils.parallelForEach[Seq[Node], Unit](executorService, nodesOutEdges, readInEdges)
      }

      def instantiateInEdges() {
        Stats.time("graph_load_instantiate_in_edge_arrays") {
          def instantiateInEdgesTask = {
            (nodes: Seq[Node]) => {
              nodes foreach { node =>
                val biDirNode = node.asInstanceOf[BiDirectionalNode]
                val nodeId = biDirNode.id
                val edgeSize = inEdgesSizes(nodeId).intValue()
                if (edgeSize > 0)
                  biDirNode.inEdges = new Array[Int](edgeSize)
                // reset inEdgesSizes, and use it as index pointer of
                // the current insertion place when adding in edges
                inEdgesSizes(nodeId).set(0)
              }
            }
          }

          ExecutorUtils.parallelForEach[Seq[Node], Unit](executorService, allNodes,
              instantiateInEdgesTask)
        }
      }

      log.debug("instantiating in edge arrays")
      instantiateInEdges()

      log.debug("populate in edges")
      populateInEdges()
    }

    def findInEdgesSizes() = Stats.time("graph_load_find_in_edge_sizes") {
      val atomicIntArray = new Array[AtomicInteger](maxNodeId + 1)
      var id = 0
      while (id <= maxNodeId) {
        if (nodeIdSet(id) == 1) {
          atomicIntArray(id) = new AtomicInteger()
        }
        id += 1
      }
      def findInEdgeSizesTask = {
        (nodes: Seq[Node]) => {
          nodes foreach { node =>
            node.outboundNodes foreach { outEdge => atomicIntArray(outEdge).incrementAndGet() }
          }
        }
      }
      ExecutorUtils.parallelForEach[Seq[Node], Unit](executorService,
          nodesOutEdges, findInEdgeSizesTask)
      atomicIntArray
    }

    new ArrayBasedDirectedGraph(table.asInstanceOf[Array[Node]], maxNodeId,
      nodeWithOutEdgesMaxId, nodeWithOutEdgesCount, numNodes, numEdges, storedGraphDir)
  }
  @VisibleForTesting
  def apply( iteratorFunc: () => Iterator[NodeIdEdgesMaxId],
        storedGraphDir: StoredGraphDir): ArrayBasedDirectedGraph =
    apply(Seq(iteratorFunc), MoreExecutors.sameThreadExecutor(), storedGraphDir)
}


/**
 * This class is an implementation of the directed graph trait that is backed by an array
 * The private constructor takes as its input a list of (@see Node) nodes, then stores
 * nodes in an array. It also builds all edges which are also stored in array.
 *
 * @param nodes the list of nodes with edges instantiated
 * @param maxId the max node id in the graph
 * @param nodeCount the number of nodes in the graph
 * @param edgeCount the number of edges in the graph
 * @param storedGraphDir the graph direction(s) stored
 */
class ArrayBasedDirectedGraph private (nodes: Array[Node], maxId: Int,
    val nodeWithOutEdgesMaxId: Int,
    val nodeWithOutEdgesCount: Int, val nodeCount: Int, val edgeCount: Long,
    val storedGraphDir: StoredGraphDir) extends DirectedGraph {

  override lazy val maxNodeId = maxId

  def iterator = nodes.iterator.filter { _ != null }

  def getNodeById(id: Int) = {
    if (id >= nodes.size) {
      None
    } else {
      val node = nodes(id)
      if (node == null) {
        None
      } else {
        Some(node)
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy