/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.mahout.flinkbindings
import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.utils.DataSetUtils
import org.apache.hadoop.io.{IntWritable, LongWritable, Text}
import org.apache.mahout.flinkbindings.blas._
import org.apache.mahout.flinkbindings.drm._
import org.apache.mahout.flinkbindings.io.{HDFSUtil, Hadoop2HDFSUtil}
import org.apache.mahout.math._
import org.apache.mahout.math.drm._
import org.apache.mahout.math.drm.logical._
import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset, Schema}
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.math.scalabindings._
import scala.collection.JavaConversions._
import scala.reflect._
object FlinkEngine extends DistributedEngine {
// By default, use Hadoop 2 utils
var hdfsUtils: HDFSUtil = Hadoop2HDFSUtil
/**
* Load a DRM from HDFS (in Mahout DRM format).
*
* @param path The DFS path to load from
* @param parMin Minimum parallelism after load (equivalent to #par(min=...)).
*/
override def drmDfsRead(path: String, parMin: Int = 1)
(implicit dc: DistributedContext): CheckpointedDrm[_] = {
// Require that context is actually Flink context.
require(dc.isInstanceOf[FlinkDistributedContext], "Supplied context must be for the Flink backend.")
// Extract the Flink execution environment from the context
implicit val env = dc.asInstanceOf[FlinkDistributedContext].env
// set the parallelism of the env to parMin
env.setParallelism(parMin)
// get the header of a SequenceFile in the path
val metadata = hdfsUtils.readDrmHeader(path + "//")
val keyClass: Class[_] = metadata.keyTypeWritable
// from the header determine which function to use to unwrap the key
val unwrapKey = metadata.unwrapKeyFunction
// Map to the correct DrmLike based on the metadata information
if (metadata.keyClassTag == ClassTag.Int) {
val ds = env.readSequenceFile(classOf[IntWritable], classOf[VectorWritable], path)
val res = ds.map(new MapFunction[(IntWritable, VectorWritable), (Int, Vector)] {
def map(tuple: (IntWritable, VectorWritable)): (Int, Vector) = {
(unwrapKey(tuple._1).asInstanceOf[Int], tuple._2.get())
}
})
datasetWrap(res)(metadata.keyClassTag.asInstanceOf[ClassTag[Int]])
} else if (metadata.keyClassTag == ClassTag.Long) {
val ds = env.readSequenceFile(classOf[LongWritable], classOf[VectorWritable], path)
val res = ds.map(new MapFunction[(LongWritable, VectorWritable), (Long, Vector)] {
def map(tuple: (LongWritable, VectorWritable)): (Long, Vector) = {
(unwrapKey(tuple._1).asInstanceOf[Long], tuple._2.get())
}
})
datasetWrap(res)(metadata.keyClassTag.asInstanceOf[ClassTag[Long]])
} else if (metadata.keyClassTag == ClassTag(classOf[String])) {
val ds = env.readSequenceFile(classOf[Text], classOf[VectorWritable], path)
val res = ds.map(new MapFunction[(Text, VectorWritable), (String, Vector)] {
def map(tuple: (Text, VectorWritable)): (String, Vector) = {
(unwrapKey(tuple._1).asInstanceOf[String], tuple._2.get())
}
})
datasetWrap(res)(metadata.keyClassTag.asInstanceOf[ClassTag[String]])
} else throw new IllegalArgumentException(s"Unsupported DRM key type: ${keyClass.getName}")
}
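// Usage sketch for drmDfsRead above (illustrative only; the context value and the path are
// assumptions, not part of this file):
//
//   implicit val ctx: FlinkDistributedContext = ...          // wraps a Flink ExecutionEnvironment
//   val drmA = FlinkEngine.drmDfsRead("/tmp/mahout/drmA", parMin = 4)
//   // drmA is Int-, Long- or String-keyed, depending on the key type recorded in the DRM header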
override def indexedDatasetDFSRead(src: String, schema: Schema, existingRowIDs: Option[BiDictionary])
(implicit sc: DistributedContext): IndexedDataset = ???
override def indexedDatasetDFSReadElements(src: String, schema: Schema, existingRowIDs: Option[BiDictionary])
(implicit sc: DistributedContext): IndexedDataset = ???
/**
* Performs the default expression rewrite and returns a plan that can be passed to exec().
*
* A particular physical engine implementation may choose whether or not to apply these
* rewrites as a basic optimization step.
*/
override def optimizerRewrite[K: ClassTag](action: DrmLike[K]): DrmLike[K] = super.optimizerRewrite(action)
/**
* Translates a logical plan into a Flink execution plan.
*/
override def toPhysical[K: ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = {
// Flink-specific Physical Plan translation.
implicit val typeInformation = generateTypeInformation[K]
val drm = flinkTranslate(plan)
val newcp = new CheckpointedFlinkDrm(ds = drm.asRowWise.ds, _nrow = plan.nrow, _ncol = plan.ncol)
newcp.cache()
}
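// How toPhysical is typically reached (hypothetical expression): a logical plan such as
// drmA.t %*% drmA stays lazy until checkpoint() (or an action) invokes toPhysical, which runs
// flinkTranslate below and wraps the resulting DataSet in a CheckpointedFlinkDrm:
//
//   val drmAtA = (drmA.t %*% drmA).checkpoint()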
private def flinkTranslate[K](oper: DrmLike[K]): FlinkDrm[K] = {
implicit val kTag = oper.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
oper match {
case OpAtAnyKey(_) ⇒
throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.")
case op@OpAx(a, x) ⇒
FlinkOpAx.blockifiedBroadcastAx(op, flinkTranslate(a))
case op@OpAt(a) if op.keyClassTag == ClassTag.Int ⇒ FlinkOpAt.sparseTrick(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
case op@OpAtx(a, x) if op.keyClassTag == ClassTag.Int ⇒
FlinkOpAx.atx_with_broadcast(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
case op@OpAtB(a, b) ⇒ FlinkOpAtB.notZippable(op, flinkTranslate(a),
flinkTranslate(b)).asInstanceOf[FlinkDrm[K]]
case op@OpABt(a, b) ⇒
// express ABt via AtB: let C=At and D=Bt, and calculate CtD
// TODO: create specific implementation of ABt, see MAHOUT-1750
val opAt = OpAt(a.asInstanceOf[DrmLike[Int]]) // TODO: casts!
val at = FlinkOpAt.sparseTrick(opAt, flinkTranslate(a.asInstanceOf[DrmLike[Int]]))
val c = new CheckpointedFlinkDrm(at.asRowWise.ds, _nrow = opAt.nrow, _ncol = opAt.ncol)
val opBt = OpAt(b.asInstanceOf[DrmLike[Int]]) // TODO: casts!
val bt = FlinkOpAt.sparseTrick(opBt, flinkTranslate(b.asInstanceOf[DrmLike[Int]]))
val d = new CheckpointedFlinkDrm(bt.asRowWise.ds, _nrow = opBt.nrow, _ncol = opBt.ncol)
FlinkOpAtB.notZippable(OpAtB(c, d), flinkTranslate(c), flinkTranslate(d)).asInstanceOf[FlinkDrm[K]]
case op@OpAtA(a) if op.keyClassTag == ClassTag.Int ⇒ FlinkOpAtA.at_a(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
case op@OpTimesRightMatrix(a, b) ⇒
FlinkOpTimesRightMatrix.drmTimesInCore(op, flinkTranslate(a), b)
case op@OpAewUnaryFunc(a, _, _) ⇒
FlinkOpAewScalar.opUnaryFunction(op, flinkTranslate(a))
case op@OpAewUnaryFuncFusion(a, _) ⇒
FlinkOpAewScalar.opUnaryFunction(op, flinkTranslate(a))
// deprecated
case op@OpAewScalar(a, scalar, _) ⇒
FlinkOpAewScalar.opScalarNoSideEffect(op, flinkTranslate(a), scalar)
case op@OpAewB(a, b, _) ⇒
FlinkOpAewB.rowWiseJoinNoSideEffect(op, flinkTranslate(a), flinkTranslate(b))
case op@OpCbind(a, b) ⇒
FlinkOpCBind.cbind(op, flinkTranslate(a), flinkTranslate(b))
case op@OpRbind(a, b) ⇒
FlinkOpRBind.rbind(op, flinkTranslate(a), flinkTranslate(b))
case op@OpCbindScalar(a, x, _) ⇒
FlinkOpCBind.cbindScalar(op, flinkTranslate(a), x)
case op@OpRowRange(a, _) ⇒
FlinkOpRowRange.slice(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
case op@OpABAnyKey(a, b) if a.keyClassTag != b.keyClassTag ⇒
throw new IllegalArgumentException("DRMs A and B have different indices, cannot multiply them")
case op: OpMapBlock[_, K] ⇒
FlinkOpMapBlock.apply(flinkTranslate(op.A), op.ncol, op)
case cp: CheckpointedDrm[K] ⇒ cp
case _ ⇒
throw new NotImplementedError(s"operator $oper is not implemented yet")
}
}
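// Translation sketch (hypothetical expression): drmA.t %*% drmB arrives here as OpAtB(a, b);
// the match recurses into both operands first, then hands the translated results to
// FlinkOpAtB.notZippable, so the physical DataSet program is assembled bottom-up from the
// leaves (CheckpointedDrm instances) of the logical operator tree.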
/**
* Returns a vector containing the column-wise sums of the DRM.
*/
override def colSums[K](drm: CheckpointedDrm[K]): Vector = {
implicit val kTag: ClassTag[K] = drm.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
val sum = drm.ds.map {
tuple => tuple._2
}.reduce(_ + _)
val list = sum.collect
list.head
}
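// Worked example for colSums (hypothetical data, assuming an implicit FlinkDistributedContext
// is in scope): the map keeps only the row vectors and the reduce adds them element-wise.
//
//   val drmA = FlinkEngine.drmParallelizeWithRowIndices(dense((1, 2), (3, 4)))
//   FlinkEngine.colSums(drmA)   // => (4.0, 6.0)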
/** Engine-specific numNonZeroElementsPerColumn implementation based on a checkpoint. */
override def numNonZeroElementsPerColumn[K](drm: CheckpointedDrm[K]): Vector = {
implicit val kTag: ClassTag[K] = drm.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
val result = drm.asBlockified.ds.map {
tuple =>
val block = tuple._2
val acc = block(0, ::).like()
block.foreach { v =>
v.nonZeroes().foreach { el => acc(el.index()) = acc(el.index()) + 1 }
}
acc
}.reduce(_ + _)
val list = result.collect
list.head
}
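// Worked example for numNonZeroElementsPerColumn (hypothetical data): for the rows (0, 2, 0)
// and (3, 0, 0), each block accumulator counts non-zero entries per column and the reduce sums
// the per-block counts, yielding (1.0, 1.0, 0.0).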
/**
* Returns a vector containing the column-wise means of the DRM.
*/
override def colMeans[K](drm: CheckpointedDrm[K]): Vector = {
drm.colSums() / drm.nrow
}
/**
* Calculates the Frobenius norm of the matrix (the square root of the sum of squared elements).
*/
override def norm[K](drm: CheckpointedDrm[K]): Double = {
implicit val kTag: ClassTag[K] = drm.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
val sumOfSquares = drm.ds.map {
tuple => tuple match {
case (idx, vec) => vec dot vec
}
}.reduce(_ + _)
// the reduce above yields a single element, so collect() returns a one-element list
val list = sumOfSquares.collect
math.sqrt(list.head)
}
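// Worked example for norm (hypothetical data): for the single row (3, 4), the map produces
// 3*3 + 4*4 = 25, the reduce sums the per-row contributions, and the result is
// math.sqrt(25) = 5.0, i.e. the Frobenius norm.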
/** Broadcast support */
override def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] =
FlinkByteBCast.wrap(v)
/** Broadcast support */
override def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix] =
FlinkByteBCast.wrap(m)
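// Usage sketch for drmBroadcast (hypothetical names; assumes the RLikeDrmOps implicits are in
// scope): the returned BCast is captured in a mapBlock closure and dereferenced with `value`
// on the workers:
//
//   val bcastV = FlinkEngine.drmBroadcast(dvec(1, 2, 3))
//   drmA.mapBlock() { case (keys, block) =>
//     for (i <- 0 until block.nrow) block(i, ::) += bcastV.value   // assumes drmA.ncol == 3
//     keys -> block
//   }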
/** Parallelize an in-core matrix as a Flink distributed matrix, using row ordinal indices as data set keys. */
// The 'numPartitions' parameter is not honored in this call,
// as Flink sets a global parallelism in ExecutionEnvironment
override def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
(implicit dc: DistributedContext): CheckpointedDrm[Int] = {
val parallelDrm = parallelize(m, numPartitions)
new CheckpointedFlinkDrm(ds = parallelDrm, _nrow = m.numRows(), _ncol = m.numCols())
}
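// Usage sketch (hypothetical values, assuming an implicit FlinkDistributedContext is in scope):
//
//   val inCore = dense((1, 2, 3), (4, 5, 6))
//   val drmA = FlinkEngine.drmParallelizeWithRowIndices(inCore)   // keys are the row ordinals 0 and 1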
// The 'parallelismDegree' parameter is not honored in this call,
// as Flink sets a global parallelism in ExecutionEnvironment
private[flinkbindings] def parallelize(m: Matrix, parallelismDegree: Int)
(implicit dc: DistributedContext): DrmDataSet[Int] = {
val rows = (0 until m.nrow).map(i => (i, m(i, ::)))
val dataSetType = TypeExtractor.getForObject(rows.head)
dc.env.fromCollection(rows).partitionByRange(0)
}
/** Parallelize an in-core matrix as a Flink distributed matrix, using row labels as data set keys. */
// The 'numPartitions' parameter is not honored in this call,
// as Flink sets a global parallelism in ExecutionEnvironment
override def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
(implicit dc: DistributedContext): CheckpointedDrm[String] = {
val rb = m.getRowLabelBindings
val p = for (i: String ← rb.keySet().toIndexedSeq) yield i → m(rb(i), ::)
new CheckpointedFlinkDrm[String](dc.env.fromCollection(p),
_nrow = m.nrow, _ncol = m.ncol, cacheHint = CacheHint.NONE)
}
/** Creates an empty DRM with the specified number of partitions and cardinality. */
override def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
(implicit dc: DistributedContext): CheckpointedDrm[Int] = {
val nonParallelResult = (0 to numPartitions).flatMap { part ⇒
val partNRow = (nrow - 1) / numPartitions + 1
val partStart = partNRow * part
val partEnd = Math.min(partStart + partNRow, nrow)
for (i <- partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
}
val result = dc.env.fromCollection(nonParallelResult)
new CheckpointedFlinkDrm[Int](ds = result, _nrow = nrow, _ncol = ncol)
}
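// Partitioning arithmetic above, on hypothetical numbers: for nrow = 10 and numPartitions = 3,
// partNRow = (10 - 1) / 3 + 1 = 4, so the generated row ranges are [0, 4), [4, 8) and [8, 10);
// any later range is clamped to empty by the Math.min guard.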
/** Creates an empty DRM whose row count may exceed Int range (Long-keyed). */
override def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
(implicit dc: DistributedContext): CheckpointedDrm[Long] = {
val nonParallelResult = (0 to numPartitions).flatMap { part ⇒
val partNRow = (nrow - 1) / numPartitions + 1
val partStart = partNRow * part
val partEnd = Math.min(partStart + partNRow, nrow)
for (i ← partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
}
val result = dc.env.fromCollection(nonParallelResult)
new CheckpointedFlinkDrm[Long](ds = result, _nrow = nrow, _ncol = ncol, cacheHint = CacheHint.NONE)
}
/**
* Convert a non-Int-keyed matrix to an Int-keyed one, optionally computing a mapping from the old
* keys to row indices in the new matrix. The mapping, if requested, is returned as a 1-column matrix.
*/
def drm2IntKeyed[K](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) = {
implicit val ktag = drmX.keyClassTag
implicit val kTypeInformation = generateTypeInformation[K]
if (ktag == ClassTag.Int) {
drmX.asInstanceOf[DrmLike[Int]] → None
} else {
val drmXcp = drmX.checkpoint(CacheHint.MEMORY_ONLY)
val ncol = drmXcp.asInstanceOf[CheckpointedFlinkDrm[K]].ncol
val nrow = drmXcp.asInstanceOf[CheckpointedFlinkDrm[K]].nrow
// Compute sequential int key numbering.
val (intDataset, keyMap) = blas.rekeySeqInts(drmDataSet = drmXcp, computeMap = computeMap)
// Convert computed key mapping to a matrix.
val mxKeyMap = keyMap.map { dataSet ⇒
datasetWrap(dataSet.map {
tuple: (K, Int) => {
val ordinal = tuple._2
val key = tuple._1
key -> (dvec(ordinal): Vector)
}
})
}
intDataset -> mxKeyMap
}
}
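// Usage sketch for drm2IntKeyed (drmS is a hypothetical String-keyed DRM): the keys of drmS are
// replaced by sequential Int ordinals and, when computeMap = true, the second element maps each
// original key to its new ordinal as a 1-column matrix:
//
//   val (drmInt, keyMapOpt) = FlinkEngine.drm2IntKeyed(drmS, computeMap = true)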
/**
* (Optional) Sampling operation.
*/
def drmSampleRows[K](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K] = {
implicit val kTag: ClassTag[K] = drmX.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
val sample = DataSetUtils(drmX.dataset).sample(replacement, fraction)
val res = if (kTag != ClassTag.Int) {
new CheckpointedFlinkDrm[K](sample)
}
else {
blas.rekeySeqInts(new RowsFlinkDrm[K](sample, ncol = drmX.ncol), computeMap = false)._1
.asInstanceOf[DrmLike[K]]
}
res
}
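// Usage sketch for drmSampleRows (drmA and the fraction are hypothetical): roughly 10% of the
// rows are kept; per the branch above, Int-keyed input is re-keyed to sequential ordinals:
//
//   val drmSampled = FlinkEngine.drmSampleRows(drmA, fraction = 0.1)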
def drmSampleKRows[K](drmX: DrmLike[K], numSamples: Int, replacement: Boolean = false): Matrix = {
implicit val kTag: ClassTag[K] = drmX.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
val sample = DataSetUtils(drmX.dataset).sampleWithSize(replacement, numSamples)
val sampleArray = sample.collect().toArray
val isSparse = sampleArray.exists { case (_, vec) ⇒ !vec.isDense }
val vectors = sampleArray.map(_._2)
val labels = sampleArray.view.zipWithIndex
.map { case ((key, _), idx) ⇒ key.toString → (idx: Integer) }.toMap
val mx: Matrix = if (isSparse) sparse(vectors: _*) else dense(vectors)
mx.setRowLabelBindings(labels)
mx
}
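// Usage sketch for drmSampleKRows (drmA and the sample size are hypothetical): returns an
// in-core Matrix of 20 sampled rows whose row label bindings record the original keys:
//
//   val mxSample = FlinkEngine.drmSampleKRows(drmA, numSamples = 20)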
/** Engine-specific all-reduce tensor operation. */
def allreduceBlock[K](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc): Matrix = {
implicit val kTag: ClassTag[K] = drm.keyClassTag
implicit val typeInformation = generateTypeInformation[K]
val res = drm.asBlockified.ds.map(par => bmf(par)).reduce(rf)
res.collect().head
}
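// Usage sketch for allreduceBlock (the block-map and reduce functions here are illustrative
// assumptions): each block is mapped to an in-core matrix and the per-block results are
// combined into a single Matrix on collect, e.g. a Gramian-style accumulation:
//
//   val gram = FlinkEngine.allreduceBlock(drmA.checkpoint(),
//     bmf = { case (keys, block) => block.t %*% block },
//     rf = (a, b) => a += b)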
def generateTypeInformation[K: ClassTag]: TypeInformation[K] = {
implicit val ktag = classTag[K]
generateTypeInformationFromTag(ktag)
}
private def generateTypeInformationFromTag[K](tag: ClassTag[K]): TypeInformation[K] = {
if (tag.runtimeClass.equals(classOf[Int])) {
createTypeInformation[Int].asInstanceOf[TypeInformation[K]]
} else if (tag.runtimeClass.equals(classOf[Long])) {
createTypeInformation[Long].asInstanceOf[TypeInformation[K]]
} else if (tag.runtimeClass.equals(classOf[String])) {
createTypeInformation[String].asInstanceOf[TypeInformation[K]]
} else {
throw new IllegalArgumentException(s"index type $tag is not supported")
}
}
}