org.apache.spark.linalg.distributed.IndexedRowMatrix.scala Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}
import org.apache.spark.linalg._
import org.apache.spark.annotation.Since
import org.apache.spark.rdd.RDD

/**
 * Represents a row of [[IndexedRowMatrix]].
 */

case class IndexedRow(index: Long, vector: Vector)

/**
 * Represents a row-oriented [[DistributedMatrix]] with
 * indexed rows.
 *
 * @param rows indexed rows of this matrix
 * @param nRows number of rows. A non-positive value means unknown, and then the number of rows will
 *              be determined by the max row index plus one.
 * @param nCols number of columns. A non-positive value means unknown, and then the number of
 *              columns will be determined by the size of the first row.
 */

class IndexedRowMatrix  (
     val rows: RDD[IndexedRow],
    private var nRows: Long,
    private var nCols: Int) extends DistributedMatrix {

  /** Alternative constructor leaving matrix dimensions to be determined automatically. */

  def this(rows: RDD[IndexedRow]) = this(rows, 0L, 0)


  override def numCols(): Long = {
    if (nCols <= 0) {
      // Calling `first` will throw an exception if `rows` is empty.
      nCols = rows.first().vector.size.toInt
    }
    nCols
  }


  override def numRows(): Long = {
    if (nRows <= 0L) {
      // Reduce will throw an exception if `rows` is empty.
      nRows = rows.map(_.index).reduce(math.max) + 1L
    }
    nRows
  }


  /**
   * Compute all cosine similarities between columns of this matrix using the brute-force
   * approach of computing normalized dot products.
   *
   * @return An n x n sparse upper-triangular matrix of cosine similarities between
   *         columns of this matrix.
   */

  def columnSimilarities(): CoordinateMatrix = {
    toRowMatrix().columnSimilarities()
  }

  /**
   * Drops row indices and converts this matrix to a
   * [[RowMatrix]].
   */

  def toRowMatrix(): RowMatrix = {
    new RowMatrix(rows.map(_.vector), 0L, nCols)
  }

  /**
   * Converts to BlockMatrix. Creates blocks with size 1024 x 1024.
   */

  def toBlockMatrix(): BlockMatrix = {
    toBlockMatrix(1024, 1024)
  }

  /**
   * Converts to BlockMatrix. Blocks may be sparse or dense depending on the sparsity of the rows.
   * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have
   *                     a smaller value. Must be an integer value greater than 0.
   * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have
   *                     a smaller value. Must be an integer value greater than 0.
   * @return a [[BlockMatrix]]
   */

  def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = {
    require(rowsPerBlock > 0,
      s"rowsPerBlock needs to be greater than 0. rowsPerBlock: $rowsPerBlock")
    require(colsPerBlock > 0,
      s"colsPerBlock needs to be greater than 0. colsPerBlock: $colsPerBlock")

    val m = numRows()
    val n = numCols()

    // Since block matrices require an integer row index
    require(math.ceil(m.toDouble / rowsPerBlock) <= Int.MaxValue,
      "Number of rows divided by rowsPerBlock cannot exceed maximum integer.")

    // The remainder calculations only matter when m % rowsPerBlock != 0 or n % colsPerBlock != 0
    val remainderRowBlockIndex = m / rowsPerBlock
    val remainderColBlockIndex = n / colsPerBlock
    val remainderRowBlockSize = (m % rowsPerBlock).toInt
    val remainderColBlockSize = (n % colsPerBlock).toInt
    val numRowBlocks = math.ceil(m.toDouble / rowsPerBlock).toInt
    val numColBlocks = math.ceil(n.toDouble / colsPerBlock).toInt

    val blocks = rows.flatMap { ir: IndexedRow =>
      val blockRow = ir.index / rowsPerBlock
      val rowInBlock = ir.index % rowsPerBlock

      ir.vector match {
        case IntSparseVector(size, indices, values) =>
          indices.zip(values).map { case (index, value) =>
            val blockColumn = index / colsPerBlock
            val columnInBlock = index % colsPerBlock
            ((blockRow.toInt, blockColumn.toInt), (rowInBlock.toInt, Array((value, columnInBlock))))
          }
        case DenseVector(values) =>
          values.grouped(colsPerBlock)
            .zipWithIndex
            .map { case (values, blockColumn) =>
              ((blockRow.toInt, blockColumn), (rowInBlock.toInt, values.zipWithIndex))
            }
      }
    }.groupByKey(GridPartitioner(numRowBlocks, numColBlocks, rows.getNumPartitions)).map {
      case ((blockRow, blockColumn), itr) =>
        val actualNumRows =
          if (blockRow == remainderRowBlockIndex) remainderRowBlockSize else rowsPerBlock
        val actualNumColumns =
          if (blockColumn == remainderColBlockIndex) remainderColBlockSize else colsPerBlock

        val arraySize = actualNumRows * actualNumColumns
        val matrixAsArray = new Array[Double](arraySize)
        var countForValues = 0
        itr.foreach { case (rowWithinBlock, valuesWithColumns) =>
          valuesWithColumns.foreach { case (value, columnWithinBlock) =>
            matrixAsArray.update(columnWithinBlock * actualNumRows + rowWithinBlock, value)
            countForValues += 1
          }
        }
        val denseMatrix = new DenseMatrix(actualNumRows, actualNumColumns, matrixAsArray)
        val finalMatrix = if (countForValues / arraySize.toDouble >= 0.1) {
          denseMatrix
        } else {
          denseMatrix.toSparse
        }

        ((blockRow, blockColumn), finalMatrix)
    }
    new BlockMatrix(blocks, rowsPerBlock, colsPerBlock, m, n)
  }

  /**
   * Converts this matrix to a
   * [[CoordinateMatrix]].
   */

  def toCoordinateMatrix(): CoordinateMatrix = {
    val entries = rows.flatMap { row =>
      val rowIndex = row.index
      row.vector match {
        case IntSparseVector(size, indices, values) =>
          Iterator.tabulate(indices.length)(i => MatrixEntry(rowIndex, indices(i), values(i)))
        case DenseVector(values) =>
          Iterator.tabulate(values.length)(i => MatrixEntry(rowIndex, i, values(i)))
      }
    }
    new CoordinateMatrix(entries, numRows(), numCols())
  }

  /**
   * Computes the singular value decomposition of this IndexedRowMatrix.
   * Denote this matrix by A (m x n), this will compute matrices U, S, V such that A = U * S * V'.
   *
   * The cost and implementation of this method is identical to that in
   * [[RowMatrix]]
   * With the addition of indices.
   *
   * At most k largest non-zero singular values and associated vectors are returned.
   * If there are k such values, then the dimensions of the return will be:
   *
   * U is an [[IndexedRowMatrix]] of size m x k that
   * satisfies U'U = eye(k),
   * s is a Vector of size k, holding the singular values in descending order,
   * and V is a local Matrix of size n x k that satisfies V'V = eye(k).
   *
   * @param k number of singular values to keep. We might return less than k if there are
   *          numerically zero singular values. See rCond.
   * @param computeU whether to compute U
   * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
   *              are treated as zero, where sigma(0) is the largest singular value.
   * @return SingularValueDecomposition(U, s, V)
   */

  def computeSVD(
      k: Int,
      computeU: Boolean = false,
      rCond: Double = 1e-9): SingularValueDecomposition[IndexedRowMatrix, Matrix] = {

    val n = numCols().toInt
    require(k > 0 && k <= n, s"Requested k singular values but got k=$k and numCols=$n.")
    val indices = rows.map(_.index)
    val svd = toRowMatrix().computeSVD(k, computeU, rCond)
    val U = if (computeU) {
      val indexedRows = indices.zip(svd.U.rows).map { case (i, v) =>
        IndexedRow(i, v)
      }
      new IndexedRowMatrix(indexedRows, nRows, svd.U.numCols().toInt)
    } else {
      null
    }
    SingularValueDecomposition(U, svd.s, svd.V)
  }

  /**
   * Multiply this matrix by a local matrix on the right.
   *
   * @param B a local matrix whose number of rows must match the number of columns of this matrix
   * @return an IndexedRowMatrix representing the product, which preserves partitioning
   */

  def multiply(B: Matrix): IndexedRowMatrix = {
    val mat = toRowMatrix().multiply(B)
    val indexedRows = rows.map(_.index).zip(mat.rows).map { case (i, v) =>
      distributed.IndexedRow(i, v)
    }
    new IndexedRowMatrix(indexedRows, nRows, B.numCols)
  }

  /**
   * Computes the Gramian matrix `A^T A`.
   *
   * @note This cannot be computed on matrices with more than 65535 columns.
   */

  def computeGramianMatrix(): Matrix = {
    toRowMatrix().computeGramianMatrix()
  }

  override def toBreeze(): BDM[Double] = {
    val m = numRows().toInt
    val n = numCols().toInt
    val mat = BDM.zeros[Double](m, n)
    rows.collect().foreach { case IndexedRow(rowIndex, vector) =>
      val i = rowIndex.toInt
      vector.foreachActive { case (j, v) =>
        mat(i, j.toInt) = v
      }
    }
    mat
  }
}