All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.cassovary.PerformanceBenchmark.scala Maven / Gradle / Ivy

/*
 * Copyright 2014 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package com.twitter.cassovary

import com.twitter.cassovary.graph._
import com.twitter.cassovary.util.io.{AdjacencyListGraphReader, ListOfEdgesGraphReader}
import com.twitter.cassovary.util.SequentialNodeNumberer
import com.twitter.app.Flags
import com.twitter.util.Stopwatch
import java.util.concurrent.Executors
import java.io.File
import scala.collection.mutable.ListBuffer

/**
 * Performance test.
 *
 * Runs the PageRank and Personalized PageRank algorithms
 * on several real-life graphs from http://snap.stanford.edu/data/. Two small
 * graphs are stored under resources; you can benchmark on a larger graph
 * by providing its URL (the graph will be downloaded).
 *
 * Usage:
 *   PerformanceBenchmark -h
 * to get started.
 *
 * Example Usages:
 * -local=facebook -globalpr
 * Benchmarks global pagerank on the local facebook graph
 *
 * -url=http://snap.stanford.edu/data/cit-HepTh.txt.gz -ppr
 * Downloads the graph from that URL into local subdir cache/ and runs personalized pagerank on it
 *
 * By default runs every test 10 times and reports average time taken.
 *
 * See: [[http://snap.stanford.edu/data/]]
 */

object PerformanceBenchmark extends App with GzipGraphDownloader {
  /**
   * Directory to store cached graphs downloaded from the web.
   */
  val CACHE_DIRECTORY = "cache/"

  /**
   * Path to the directory storing small graphs.
   */
  val SMALL_FILES_DIRECTORY = "src/main/resources/graphs"

  /**
   * Files to be benchmarked as a list of (directory, name) pairs.
   */
  val files = ListBuffer[(String, String)]()

  /**
   * Graphs used when no graph is selected on the command line.
   */
  lazy val smallFiles = List((SMALL_FILES_DIRECTORY, "facebook"), (SMALL_FILES_DIRECTORY, "wiki-Vote"))

  /**
   * Builders of algorithms to be benchmarked.
   */
  val benchmarks = ListBuffer[(DirectedGraph => OperationBenchmark)]()

  /**
   * Number of repeats of every benchmark.
   */
  val DEFAULT_REPS = 10

  /**
   * Default value of the `local` flag.
   */
  val defaultLocalGraphFile = "facebook"

  val flags = new Flags("Performance benchmark")
  val localFileFlag = flags("local", defaultLocalGraphFile,
    "Specify common prefix of local files in " + SMALL_FILES_DIRECTORY)
  val remoteFileFlag = flags("url",
    "http://snap.stanford.edu/data/cit-HepTh.txt.gz",
    "Specify a URL to download a graph file from")
  val helpFlag = flags("h", false, "Print usage")
  val globalPRFlag = flags("globalpr", false, "run global pagerank benchmark")
  val pprFlag = flags("ppr", false, "run personalized pagerank benchmark")
  val getNodeFlag = flags("gn", 0, "run getNodeById benchmark with a given number of steps")
  val reps = flags("reps", DEFAULT_REPS, "number of times to run benchmark")
  val adjacencyList = flags("a", false, "graph in adjacency list format")
  // NOTE(review): the result of parseArgs is not inspected here; confirm how the
  // util-app version in use reports malformed arguments.
  flags.parseArgs(args)

  if (helpFlag()) {
    // Handle the help request before anything with side effects (in particular
    // before a remote graph is downloaded).
    println(flags.usage)
  } else {
    // NOTE(review): isDefined is presumably true only for flags given explicitly
    // on the command line (not for defaults) - confirm against util-app.
    if (localFileFlag.isDefined) files += ((SMALL_FILES_DIRECTORY, localFileFlag()))
    if (remoteFileFlag.isDefined) files += cacheRemoteFile(remoteFileFlag())
    if (files.isEmpty) {
      println("No files specified on command line. Taking default graph files facebook and wiki-Vote.")
      files ++= smallFiles
    }

    if (globalPRFlag()) { benchmarks += (g => new PageRankBenchmark(g)) }
    if (pprFlag()) { benchmarks += (g => new PersonalizedPageRankBenchmark(g)) }
    if (getNodeFlag() > 0) {
      benchmarks += (g => new GetNodeByIdBenchmark(g, getNodeFlag(), GraphDir.OutDir))
    }

    /**
     * Thread pool used for reading graphs. Only useful if multiple files with the same prefix name are present.
     */
    val graphReadingThreadPool = Executors.newFixedThreadPool(4)

    /**
     * Reads a graph into an array based directed graph.
     *
     * @param path directory containing the graph file(s)
     * @param filename name (prefix) of the graph file(s) inside `path`
     * @param adjacencyList if true the adjacency list reader is used, otherwise
     *                      the list of edges reader
     */
    def readGraph(path : String, filename : String, adjacencyList: Boolean) : DirectedGraph = {
      if (adjacencyList) {
        AdjacencyListGraphReader.forIntIds(path, filename, graphReadingThreadPool).toArrayBasedDirectedGraph()
      } else {
        ListOfEdgesGraphReader.forIntIds(path, filename, graphReadingThreadPool).toArrayBasedDirectedGraph()
      }
    }

    try {
      if (benchmarks.isEmpty) {
        println("No benchmarks specified on command line. Will only read the local graph files.")
      }

      // Report the input format that was actually used to load the graph.
      val inputFormat = if (adjacencyList()) "adjacency list" else "list of edges"

      files.foreach {
        case (path, filename) =>
          printf("Reading %s graph from %s\n", filename, path)
          val readingTime = Stopwatch.start()
          val graph = readGraph(path, filename, adjacencyList())
          printf("\tGraph %s loaded from %s with %s nodes and %s edges.\n" +
                 "\tLoading Time: %s\n", filename, inputFormat, graph.nodeCount, graph.edgeCount,
                 readingTime())
          for (b <- benchmarks) {
            val benchmark = b(graph)
            printf("Running benchmark %s on graph %s...\n", benchmark.name, filename)
            val duration = benchmark.run(reps())
            printf("\tAvg time over %d repetitions: %s.\n", reps(), duration)
          }
      }
    } finally {
      // Always release the pool, even when reading or benchmarking throws;
      // otherwise its worker threads can keep the JVM from exiting.
      graphReadingThreadPool.shutdown()
    }
  }

  /**
   * Downloads the graph at `url` into [[CACHE_DIRECTORY]] and unpacks it.
   *
   * The local file name is the part of the last URL path segment before its
   * first dot, with ".txt" appended (e.g. ".../cit-HepTh.txt.gz" becomes
   * "cit-HepTh.txt").
   *
   * @param url location of the graph file to download
   * @return (directory, file name) pair of the downloaded graph
   */
  def cacheRemoteFile(url : String) : (String, String) = {
    printf("Downloading remote file from %s\n", url)
    // Make sure the cache directory exists; the result of mkdirs is not checked.
    new File(CACHE_DIRECTORY).mkdirs()
    val name = url.split("/").last.split("\\.")(0) + ".txt"
    val target =  CACHE_DIRECTORY + name
    downloadAndUnpack(url, target)
    (CACHE_DIRECTORY, name)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy