All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.linalg.distributed.CoordinateMatrix.scala Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}
import org.apache.spark.linalg.{Matrix, SparseMatrix, Vectors}
import org.apache.spark.annotation.Since
import org.apache.spark.rdd.RDD

/**
 * One cell of a distributed matrix, addressed by its (row, column) coordinates.
 *
 * @param i     row index of the cell
 * @param j     column index of the cell
 * @param value numeric value stored at position (i, j)
 */
case class MatrixEntry(
    i: Long,
    j: Long,
    value: Double)

/**
 * Represents a matrix in coordinate format.
 *
 * The dimensions are cached in mutable private fields: when either one is non-positive
 * (unknown), the first call to [[numRows]] or [[numCols]] scans `entries` once and fills in
 * both.
 *
 * @param entries matrix entries
 * @param nRows number of rows. A non-positive value means unknown, and then the number of rows will
 *              be determined by the max row index plus one.
 * @param nCols number of columns. A non-positive value means unknown, and then the number of
 *              columns will be determined by the max column index plus one.
 */
class CoordinateMatrix(
    val entries: RDD[MatrixEntry],
    private var nRows: Long,
    private var nCols: Long) extends DistributedMatrix {

  /** Alternative constructor leaving matrix dimensions to be determined automatically. */
  def this(entries: RDD[MatrixEntry]) = this(entries, 0L, 0L)

  /**
   * Gets or computes the number of columns.
   *
   * @return the cached column count, computed from `entries` first if it is still unknown
   */
  override def numCols(): Long = {
    if (nCols <= 0L) {
      computeSize()
    }
    nCols
  }

  /**
   * Gets or computes the number of rows.
   *
   * @return the cached row count, computed from `entries` first if it is still unknown
   */
  override def numRows(): Long = {
    if (nRows <= 0L) {
      computeSize()
    }
    nRows
  }

  /**
   * Transposes this CoordinateMatrix.
   *
   * @return a new matrix whose entries have row and column indices swapped and whose
   *         dimensions are (numCols, numRows) of this matrix
   */
  def transpose(): CoordinateMatrix = {
    new CoordinateMatrix(entries.map(x => MatrixEntry(x.j, x.i, x.value)), numCols(), numRows())
  }

  /**
   * Converts to IndexedRowMatrix. The number of columns must be within the integer range.
   *
   * Entries are grouped by row index and each group becomes one sparse row vector of length
   * `numCols()`.
   *
   * @return an [[IndexedRowMatrix]] with the same dimensions as this matrix
   * @throws RuntimeException if the number of columns exceeds `Int.MaxValue`
   */
  def toIndexedRowMatrix(): IndexedRowMatrix = {
    val nl = numCols()
    if (nl > Int.MaxValue) {
      sys.error(s"Cannot convert to a row-oriented format because the number of columns $nl is " +
        "too large.")
    }
    val n = nl.toInt
    // The column index is narrowed to Int here; the guard above only bounds the declared
    // column count, so this relies on every entry's `j` being smaller than that count.
    val indexedRows = entries.map(entry => (entry.i, (entry.j.toInt, entry.value)))
      .groupByKey()
      .map { case (i, vectorEntries) =>
        IndexedRow(i, Vectors.sparse(n, vectorEntries.toSeq))
      }
    new IndexedRowMatrix(indexedRows, numRows(), n)
  }

  /**
   * Converts to RowMatrix, dropping row indices after grouping by row index.
   * The number of columns must be within the integer range.
   *
   * @return the [[RowMatrix]] obtained from [[toIndexedRowMatrix]]
   */
  def toRowMatrix(): RowMatrix = {
    toIndexedRowMatrix().toRowMatrix()
  }

  /**
   * Converts to BlockMatrix. Creates blocks of `SparseMatrix` with size 1024 x 1024.
   *
   * @return a [[BlockMatrix]]
   */
  def toBlockMatrix(): BlockMatrix = {
    toBlockMatrix(1024, 1024)
  }

  /**
   * Converts to BlockMatrix. Creates blocks of `SparseMatrix`.
   * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have
   *                     a smaller value. Must be an integer value greater than 0.
   * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have
   *                     a smaller value. Must be an integer value greater than 0.
   * @return a [[BlockMatrix]]
   * @throws IllegalArgumentException if a block size is not positive, or if the resulting
   *                                  number of row or column blocks exceeds `Int.MaxValue`
   */
  def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = {
    require(rowsPerBlock > 0,
      s"rowsPerBlock needs to be greater than 0. rowsPerBlock: $rowsPerBlock")
    require(colsPerBlock > 0,
      s"colsPerBlock needs to be greater than 0. colsPerBlock: $colsPerBlock")
    val m = numRows()
    val n = numCols()

    // Block counts are computed once as Double so they can be range-checked before narrowing.
    val rowBlockCount = math.ceil(m.toDouble / rowsPerBlock)
    val colBlockCount = math.ceil(n.toDouble / colsPerBlock)

    // Since block matrices require an integer row and col index
    require(rowBlockCount <= Int.MaxValue,
      "Number of rows divided by rowsPerBlock cannot exceed maximum integer.")
    require(colBlockCount <= Int.MaxValue,
      "Number of cols divided by colsPerBlock cannot exceed maximum integer.")

    val numRowBlocks = rowBlockCount.toInt
    val numColBlocks = colBlockCount.toInt
    val partitioner = GridPartitioner(numRowBlocks, numColBlocks, entries.partitions.length)

    val blocks: RDD[((Int, Int), Matrix)] = entries.map { entry =>
      // Which block the entry falls into ...
      val blockRowIndex = (entry.i / rowsPerBlock).toInt
      val blockColIndex = (entry.j / colsPerBlock).toInt

      // ... and its position inside that block.
      val rowId = entry.i % rowsPerBlock
      val colId = entry.j % colsPerBlock

      ((blockRowIndex, blockColIndex), (rowId.toInt, colId.toInt, entry.value))
    }.groupByKey(partitioner).map { case ((blockRowIndex, blockColIndex), entry) =>
      // Blocks on the bottom/right edge are clipped to the remaining rows/columns.
      val effRows = math.min(m - blockRowIndex.toLong * rowsPerBlock, rowsPerBlock).toInt
      val effCols = math.min(n - blockColIndex.toLong * colsPerBlock, colsPerBlock).toInt
      ((blockRowIndex, blockColIndex), SparseMatrix.fromCOO(effRows, effCols, entry))
    }
    new BlockMatrix(blocks, rowsPerBlock, colsPerBlock, m, n)
  }

  /**
   * Determines the size by computing the max row/column index.
   *
   * Updates both `nRows` and `nCols` in a single pass over `entries`. A dimension that was
   * supplied explicitly and is larger than the observed max index plus one is kept as is.
   */
  private def computeSize(): Unit = {
    // Reduce will throw an exception if `entries` is empty.
    val (m1, n1) = entries.map(entry => (entry.i, entry.j)).reduce { case ((i1, j1), (i2, j2)) =>
      (math.max(i1, i2), math.max(j1, j2))
    }
    // There may be empty columns at the very right and empty rows at the very bottom.
    nRows = math.max(nRows, m1 + 1L)
    nCols = math.max(nCols, n1 + 1L)
  }

  /**
   * Collects data and assembles a local matrix.
   *
   * All entries are collected to the caller and written into a dense, zero-initialised
   * matrix. If several entries share the same coordinates, the one processed last overwrites
   * the earlier ones.
   *
   * @return a dense local matrix of size `numRows()` x `numCols()`
   * @throws IllegalArgumentException if either dimension, or their product, exceeds
   *                                  `Int.MaxValue` and therefore cannot be held locally
   */
  override def toBreeze(): BDM[Double] = {
    val rows = numRows()
    val cols = numCols()
    // Validate before narrowing: an unchecked Long-to-Int conversion would silently wrap.
    require(rows <= Int.MaxValue,
      s"The number of rows must not exceed Int.MaxValue. Currently numRows: $rows")
    require(cols <= Int.MaxValue,
      s"The number of columns must not exceed Int.MaxValue. Currently numCols: $cols")
    // Both factors fit in an Int at this point, so the Long product cannot overflow.
    require(rows * cols <= Int.MaxValue,
      "The length of the values array must not exceed Int.MaxValue. " +
        s"Currently numRows * numCols: ${rows * cols}")
    val m = rows.toInt
    val n = cols.toInt
    val mat = BDM.zeros[Double](m, n)
    entries.collect().foreach { case MatrixEntry(i, j, value) =>
      mat(i.toInt, j.toInt) = value
    }
    mat
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy