All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.graphx.PartitionStrategy.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.graphx

/**
 * Represents the way edges are assigned to edge partitions based on their source and destination
 * vertex IDs.
 */
trait PartitionStrategy extends Serializable {
  /** Returns the partition number for a given edge. */
  def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID
}

/**
 * Collection of built-in [[PartitionStrategy]] implementations.
 */
object PartitionStrategy {
  /**
   * Assigns edges to partitions using a 2D partitioning of the sparse edge adjacency matrix,
   * guaranteeing a `2 * sqrt(numParts)` bound on vertex replication.
   *
   * Suppose we have a graph with 12 vertices that we want to partition
   * over 9 machines.  We can use the following sparse matrix representation:
   *
   * 
   *       __________________________________
   *  v0   | P0 *     | P1       | P2    *  |
   *  v1   |  ****    |  *       |          |
   *  v2   |  ******* |      **  |  ****    |
   *  v3   |  *****   |  *  *    |       *  |
   *       ----------------------------------
   *  v4   | P3 *     | P4 ***   | P5 **  * |
   *  v5   |  *  *    |  *       |          |
   *  v6   |       *  |      **  |  ****    |
   *  v7   |  * * *   |  *  *    |       *  |
   *       ----------------------------------
   *  v8   | P6   *   | P7    *  | P8  *   *|
   *  v9   |     *    |  *    *  |          |
   *  v10  |       *  |      **  |  *  *    |
   *  v11  | * <-E    |  ***     |       ** |
   *       ----------------------------------
   * 
* * The edge denoted by `E` connects `v11` with `v1` and is assigned to processor `P6`. To get the * processor number we divide the matrix into `sqrt(numParts)` by `sqrt(numParts)` blocks. Notice * that edges adjacent to `v11` can only be in the first column of blocks `(P0, P3, * P6)` or the last * row of blocks `(P6, P7, P8)`. As a consequence we can guarantee that `v11` will need to be * replicated to at most `2 * sqrt(numParts)` machines. * * Notice that `P0` has many edges and as a consequence this partitioning would lead to poor work * balance. To improve balance we first multiply each vertex id by a large prime to shuffle the * vertex locations. * * When the number of partitions requested is not a perfect square we use a slightly different * method where the last column can have a different number of rows than the others while still * maintaining the same size per block. */ case object EdgePartition2D extends PartitionStrategy { override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { val ceilSqrtNumParts: PartitionID = math.ceil(math.sqrt(numParts)).toInt val mixingPrime: VertexId = 1125899906842597L if (numParts == ceilSqrtNumParts * ceilSqrtNumParts) { // Use old method for perfect squared to ensure we get same results val col: PartitionID = (math.abs(src * mixingPrime) % ceilSqrtNumParts).toInt val row: PartitionID = (math.abs(dst * mixingPrime) % ceilSqrtNumParts).toInt (col * ceilSqrtNumParts + row) % numParts } else { // Otherwise use new method val cols = ceilSqrtNumParts val rows = (numParts + cols - 1) / cols val lastColRows = numParts - rows * (cols - 1) val col = (math.abs(src * mixingPrime) % numParts / rows).toInt val row = (math.abs(dst * mixingPrime) % (if (col < cols - 1) rows else lastColRows)).toInt col * rows + row } } } /** * Assigns edges to partitions using only the source vertex ID, colocating edges with the same * source. */ case object EdgePartition1D extends PartitionStrategy { override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { val mixingPrime: VertexId = 1125899906842597L (math.abs(src * mixingPrime) % numParts).toInt } } /** * Assigns edges to partitions by hashing the source and destination vertex IDs, resulting in a * random vertex cut that colocates all same-direction edges between two vertices. */ case object RandomVertexCut extends PartitionStrategy { override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { math.abs((src, dst).hashCode()) % numParts } } /** * Assigns edges to partitions by hashing the source and destination vertex IDs in a canonical * direction, resulting in a random vertex cut that colocates all edges between two vertices, * regardless of direction. */ case object CanonicalRandomVertexCut extends PartitionStrategy { override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { if (src < dst) { math.abs((src, dst).hashCode()) % numParts } else { math.abs((dst, src).hashCode()) % numParts } } } /** Returns the PartitionStrategy with the specified name. */ def fromString(s: String): PartitionStrategy = s match { case "RandomVertexCut" => RandomVertexCut case "EdgePartition1D" => EdgePartition1D case "EdgePartition2D" => EdgePartition2D case "CanonicalRandomVertexCut" => CanonicalRandomVertexCut case _ => throw new IllegalArgumentException("Invalid PartitionStrategy: " + s) } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy