
org.apache.mahout.flinkbindings.FlinkEngine.scala

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.mahout.flinkbindings

import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.utils.DataSetUtils
import org.apache.hadoop.io.{IntWritable, LongWritable, Text}
import org.apache.mahout.flinkbindings.blas._
import org.apache.mahout.flinkbindings.drm._
import org.apache.mahout.flinkbindings.io.{HDFSUtil, Hadoop2HDFSUtil}
import org.apache.mahout.math._
import org.apache.mahout.math.drm._
import org.apache.mahout.math.drm.logical._
import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset, Schema}
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.math.scalabindings._

import scala.collection.JavaConversions._
import scala.reflect._

object FlinkEngine extends DistributedEngine {

  // By default, use Hadoop 2 utils
  var hdfsUtils: HDFSUtil = Hadoop2HDFSUtil
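  // Note: because this is a mutable field of type HDFSUtil, a different implementation
  // can be swapped in before reading, e.g. (hypothetical implementation name):
  //
  //   FlinkEngine.hdfsUtils = MyCustomHdfsUtil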

  /**
    * Load a DRM from HDFS (in Mahout DRM format).
    *
    * @param path   The DFS path to load from
    * @param parMin Minimum parallelism after load (equivalent to #par(min=...)).
    */
  override def drmDfsRead(path: String, parMin: Int = 1)
                         (implicit dc: DistributedContext): CheckpointedDrm[_] = {

    // Require that context is actually Flink context.
    require(dc.isInstanceOf[FlinkDistributedContext], "Supplied context must be for the Flink backend.")

    // Extract the Flink Environment variable
    implicit val env = dc.asInstanceOf[FlinkDistributedContext].env

    // set the parallelism of the env to parMin
    env.setParallelism(parMin)

    // get the header of a SequenceFile in the path
    val metadata = hdfsUtils.readDrmHeader(path + "//")

    val keyClass: Class[_] = metadata.keyTypeWritable

    // from the header determine which function to use to unwrap the key
    val unwrapKey = metadata.unwrapKeyFunction

    // Map to the correct DrmLike based on the metadata information
    if (metadata.keyClassTag == ClassTag.Int) {
      val ds = env.readSequenceFile(classOf[IntWritable], classOf[VectorWritable], path)

      val res = ds.map(new MapFunction[(IntWritable, VectorWritable), (Int, Vector)] {
        def map(tuple: (IntWritable, VectorWritable)): (Int, Vector) = {
          (unwrapKey(tuple._1).asInstanceOf[Int], tuple._2.get())
        }
      })
      datasetWrap(res)(metadata.keyClassTag.asInstanceOf[ClassTag[Int]])
    } else if (metadata.keyClassTag == ClassTag.Long) {
      val ds = env.readSequenceFile(classOf[LongWritable], classOf[VectorWritable], path)

      val res = ds.map(new MapFunction[(LongWritable, VectorWritable), (Long, Vector)] {
        def map(tuple: (LongWritable, VectorWritable)): (Long, Vector) = {
          (unwrapKey(tuple._1).asInstanceOf[Long], tuple._2.get())
        }
      })
      datasetWrap(res)(metadata.keyClassTag.asInstanceOf[ClassTag[Long]])
    } else if (metadata.keyClassTag == ClassTag(classOf[String])) {
      val ds = env.readSequenceFile(classOf[Text], classOf[VectorWritable], path)

      val res = ds.map(new MapFunction[(Text, VectorWritable), (String, Vector)] {
        def map(tuple: (Text, VectorWritable)): (String, Vector) = {
          (unwrapKey(tuple._1).asInstanceOf[String], tuple._2.get())
        }
      })
      datasetWrap(res)(metadata.keyClassTag.asInstanceOf[ClassTag[String]])
    } else throw new IllegalArgumentException(s"Unsupported DRM key type:${keyClass.getName}")

  }
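
  // Usage sketch (illustrative only): drmDfsRead expects the implicit context to be a
  // FlinkDistributedContext. Assuming such a context `ctx` has been created elsewhere
  // (its construction is not shown in this file), reading a previously persisted DRM
  // might look like:
  //
  //   implicit val ctx: FlinkDistributedContext = ...   // hypothetical setup
  //   val drmA = FlinkEngine.drmDfsRead("hdfs://namenode/drms/A", parMin = 4)
  //
  // The key type (Int, Long or String) is recovered from the DRM header, matching the
  // three branches above.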

  override def indexedDatasetDFSRead(src: String, schema: Schema, existingRowIDs: Option[BiDictionary])
                                    (implicit sc: DistributedContext): IndexedDataset = ???

  override def indexedDatasetDFSReadElements(src: String, schema: Schema, existingRowIDs: Option[BiDictionary])
                                            (implicit sc: DistributedContext): IndexedDataset = ???


  /**
    * Perform default expression rewrite. Return physical plan that we can pass to exec().
    *
    * A particular physical engine implementation may choose to either use or not use these rewrites
    * as a useful basic rewriting rule.
    */
  override def optimizerRewrite[K: ClassTag](action: DrmLike[K]): DrmLike[K] = super.optimizerRewrite(action)

  /**
    * Translates logical plan into Flink execution plan.
    */
  override def toPhysical[K: ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = {
    // Flink-specific Physical Plan translation.
    implicit val typeInformation = generateTypeInformation[K]

    val drm = flinkTranslate(plan)
    val newcp = new CheckpointedFlinkDrm(ds = drm.asRowWise.ds, _nrow = plan.nrow, _ncol = plan.ncol)
    newcp.cache()
  }

  private def flinkTranslate[K](oper: DrmLike[K]): FlinkDrm[K] = {
    implicit val kTag = oper.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    oper match {
      case OpAtAnyKey(_) ⇒
        throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.")
      case op@OpAx(a, x) ⇒
        FlinkOpAx.blockifiedBroadcastAx(op, flinkTranslate(a))
      case op@OpAt(a) if op.keyClassTag == ClassTag.Int ⇒
        FlinkOpAt.sparseTrick(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
      case op@OpAtx(a, x) if op.keyClassTag == ClassTag.Int ⇒
        FlinkOpAx.atx_with_broadcast(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
      case op@OpAtB(a, b) ⇒
        FlinkOpAtB.notZippable(op, flinkTranslate(a), flinkTranslate(b)).asInstanceOf[FlinkDrm[K]]
      case op@OpABt(a, b) ⇒
        // express ABt via AtB: let C=At and D=Bt, and calculate CtD
        // TODO: create specific implementation of ABt, see MAHOUT-1750
        val opAt = OpAt(a.asInstanceOf[DrmLike[Int]]) // TODO: casts!
        val at = FlinkOpAt.sparseTrick(opAt, flinkTranslate(a.asInstanceOf[DrmLike[Int]]))
        val c = new CheckpointedFlinkDrm(at.asRowWise.ds, _nrow = opAt.nrow, _ncol = opAt.ncol)

        val opBt = OpAt(b.asInstanceOf[DrmLike[Int]]) // TODO: casts!
        val bt = FlinkOpAt.sparseTrick(opBt, flinkTranslate(b.asInstanceOf[DrmLike[Int]]))
        val d = new CheckpointedFlinkDrm(bt.asRowWise.ds, _nrow = opBt.nrow, _ncol = opBt.ncol)

        FlinkOpAtB.notZippable(OpAtB(c, d), flinkTranslate(c), flinkTranslate(d)).asInstanceOf[FlinkDrm[K]]
      case op@OpAtA(a) if op.keyClassTag == ClassTag.Int ⇒
        FlinkOpAtA.at_a(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
      case op@OpTimesRightMatrix(a, b) ⇒
        FlinkOpTimesRightMatrix.drmTimesInCore(op, flinkTranslate(a), b)
      case op@OpAewUnaryFunc(a, _, _) ⇒
        FlinkOpAewScalar.opUnaryFunction(op, flinkTranslate(a))
      case op@OpAewUnaryFuncFusion(a, _) ⇒
        FlinkOpAewScalar.opUnaryFunction(op, flinkTranslate(a))
      // deprecated
      case op@OpAewScalar(a, scalar, _) ⇒
        FlinkOpAewScalar.opScalarNoSideEffect(op, flinkTranslate(a), scalar)
      case op@OpAewB(a, b, _) ⇒
        FlinkOpAewB.rowWiseJoinNoSideEffect(op, flinkTranslate(a), flinkTranslate(b))
      case op@OpCbind(a, b) ⇒
        FlinkOpCBind.cbind(op, flinkTranslate(a), flinkTranslate(b))
      case op@OpRbind(a, b) ⇒
        FlinkOpRBind.rbind(op, flinkTranslate(a), flinkTranslate(b))
      case op@OpCbindScalar(a, x, _) ⇒
        FlinkOpCBind.cbindScalar(op, flinkTranslate(a), x)
      case op@OpRowRange(a, _) ⇒
        FlinkOpRowRange.slice(op, flinkTranslate(a)).asInstanceOf[FlinkDrm[K]]
      case op@OpABAnyKey(a, b) if a.keyClassTag != b.keyClassTag ⇒
        throw new IllegalArgumentException("DRMs A and B have different indices, cannot multiply them")
      case op: OpMapBlock[_, K] ⇒
        FlinkOpMapBlock.apply(flinkTranslate(op.A), op.ncol, op)
      case cp: CheckpointedDrm[K] ⇒
        cp
      case _ ⇒
        throw new NotImplementedError(s"operator $oper is not implemented yet")
    }
  }

  /**
    * returns a vector that contains a column-wise sum from DRM
    */
  override def colSums[K](drm: CheckpointedDrm[K]): Vector = {
    implicit val kTag: ClassTag[K] = drm.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    val sum = drm.ds.map {
      tuple => tuple._2
    }.reduce(_ + _)

    val list = sum.collect

    list.head
  }

  /** Engine-specific numNonZeroElementsPerColumn implementation based on a checkpoint. */
  override def numNonZeroElementsPerColumn[K](drm: CheckpointedDrm[K]): Vector = {
    implicit val kTag: ClassTag[K] = drm.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    val result = drm.asBlockified.ds.map {
      tuple =>
        val block = tuple._2
        val acc = block(0, ::).like()

        block.foreach { v =>
          v.nonZeroes().foreach { el =>
            acc(el.index()) = acc(el.index()) + 1
          }
        }

        acc
    }.reduce(_ + _)

    val list = result.collect

    list.head
  }

  /**
    * returns a vector that contains a column-wise mean from DRM
    */
  override def colMeans[K](drm: CheckpointedDrm[K]): Vector = {
    drm.colSums() / drm.nrow
  }

  /**
    * Calculates the element-wise squared norm of a matrix
    */
  override def norm[K](drm: CheckpointedDrm[K]): Double = {
    implicit val kTag: ClassTag[K] = drm.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    val sumOfSquares = drm.ds.map {
      tuple => tuple match {
        case (idx, vec) => vec dot vec
      }
    }.reduce(_ + _)

    val list = sumOfSquares.collect // check on this --why is it returning a list?

    math.sqrt(list.head)
  }

  /** Broadcast support */
  override def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] = FlinkByteBCast.wrap(v)

  /** Broadcast support */
  override def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix] = FlinkByteBCast.wrap(m)

  /** Parallelize in-core matrix as flink distributed matrix, using row ordinal indices as data set keys. */
  // The 'numPartitions' parameter is not honored in this call,
  // as Flink sets a global parallelism in ExecutionEnvironment
  override def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
                                           (implicit dc: DistributedContext): CheckpointedDrm[Int] = {
    val parallelDrm = parallelize(m, numPartitions)
    new CheckpointedFlinkDrm(ds = parallelDrm, _nrow = m.numRows(), _ncol = m.numCols())
  }

  // The 'parallelismDegree' parameter is not honored in this call,
  // as Flink sets a global parallelism in ExecutionEnvironment
  private[flinkbindings] def parallelize(m: Matrix, parallelismDegree: Int)
                                        (implicit dc: DistributedContext): DrmDataSet[Int] = {
    val rows = (0 until m.nrow).map(i => (i, m(i, ::)))
    val dataSetType = TypeExtractor.getForObject(rows.head)
    dc.env.fromCollection(rows).partitionByRange(0)
  }

  /** Parallelize in-core matrix as flink distributed matrix, using row labels as a data set keys. */
  // The 'numPartitions' parameter is not honored in this call,
  // as Flink sets a global parallelism in ExecutionEnvironment
  override def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
                                          (implicit dc: DistributedContext): CheckpointedDrm[String] = {
    val rb = m.getRowLabelBindings
    val p = for (i: String ← rb.keySet().toIndexedSeq) yield i → m(rb(i), ::)

    new CheckpointedFlinkDrm[String](dc.env.fromCollection(p),
      _nrow = m.nrow, _ncol = m.ncol, cacheHint = CacheHint.NONE)
  }

  /** This creates an empty DRM with specified number of partitions and cardinality. */
  override def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
                                  (implicit dc: DistributedContext): CheckpointedDrm[Int] = {
    val nonParallelResult = (0 to numPartitions).flatMap { part ⇒
      val partNRow = (nrow - 1) / numPartitions + 1
      val partStart = partNRow * part
      val partEnd = Math.min(partStart + partNRow, nrow)

      for (i <- partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
    }
    val result = dc.env.fromCollection(nonParallelResult)
    new CheckpointedFlinkDrm[Int](ds = result, _nrow = nrow, _ncol = ncol)
  }

  /** Creates empty DRM with non-trivial height */
  override def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
                                      (implicit dc: DistributedContext): CheckpointedDrm[Long] = {
    val nonParallelResult = (0 to numPartitions).flatMap { part ⇒
      val partNRow = (nrow - 1) / numPartitions + 1
      val partStart = partNRow * part
      val partEnd = Math.min(partStart + partNRow, nrow)

      for (i ← partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
    }
    val result = dc.env.fromCollection(nonParallelResult)
    new CheckpointedFlinkDrm[Long](ds = result, _nrow = nrow, _ncol = ncol, cacheHint = CacheHint.NONE)
  }

  /**
    * Convert non-int-keyed matrix to an int-keyed, computing optionally mapping from old keys
    * to row indices in the new one. The mapping, if requested, is returned as a 1-column matrix.
    */
  def drm2IntKeyed[K](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) = {
    implicit val ktag = drmX.keyClassTag
    implicit val kTypeInformation = generateTypeInformation[K]

    if (ktag == ClassTag.Int) {
      drmX.asInstanceOf[DrmLike[Int]] → None
    } else {
      val drmXcp = drmX.checkpoint(CacheHint.MEMORY_ONLY)
      val ncol = drmXcp.asInstanceOf[CheckpointedFlinkDrm[K]].ncol
      val nrow = drmXcp.asInstanceOf[CheckpointedFlinkDrm[K]].nrow

      // Compute sequential int key numbering.
      val (intDataset, keyMap) = blas.rekeySeqInts(drmDataSet = drmXcp, computeMap = computeMap)

      // Convert computed key mapping to a matrix.
      val mxKeyMap = keyMap.map { dataSet ⇒
        datasetWrap(dataSet.map {
          tuple: (K, Int) => {
            val ordinal = tuple._2
            val key = tuple._1
            key -> (dvec(ordinal): Vector)
          }
        })
      }

      intDataset -> mxKeyMap
    }
  }

  /**
    * (Optional) Sampling operation.
    */
  def drmSampleRows[K](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K] = {
    implicit val kTag: ClassTag[K] = drmX.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    val sample = DataSetUtils(drmX.dataset).sample(replacement, fraction)

    val res = if (kTag != ClassTag.Int) {
      new CheckpointedFlinkDrm[K](sample)
    } else {
      blas.rekeySeqInts(new RowsFlinkDrm[K](sample, ncol = drmX.ncol), computeMap = false)._1
        .asInstanceOf[DrmLike[K]]
    }

    res
  }

  def drmSampleKRows[K](drmX: DrmLike[K], numSamples: Int, replacement: Boolean = false): Matrix = {
    implicit val kTag: ClassTag[K] = drmX.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    val sample = DataSetUtils(drmX.dataset).sampleWithSize(replacement, numSamples)
    val sampleArray = sample.collect().toArray

    val isSparse = sampleArray.exists { case (_, vec) ⇒ !vec.isDense }

    val vectors = sampleArray.map(_._2)
    val labels = sampleArray.view.zipWithIndex
      .map { case ((key, _), idx) ⇒ key.toString → (idx: Integer) }.toMap

    val mx: Matrix = if (isSparse) sparse(vectors: _*) else dense(vectors)
    mx.setRowLabelBindings(labels)

    mx
  }

  /** Engine-specific all reduce tensor operation. */
  def allreduceBlock[K](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc): Matrix = {
    implicit val kTag: ClassTag[K] = drm.keyClassTag
    implicit val typeInformation = generateTypeInformation[K]

    val res = drm.asBlockified.ds.map(par => bmf(par)).reduce(rf)
    res.collect().head
  }

  def generateTypeInformation[K: ClassTag]: TypeInformation[K] = {
    implicit val ktag = classTag[K]
    generateTypeInformationFromTag(ktag)
  }

  private def generateTypeInformationFromTag[K](tag: ClassTag[K]): TypeInformation[K] = {
    if (tag.runtimeClass.equals(classOf[Int])) {
      createTypeInformation[Int].asInstanceOf[TypeInformation[K]]
    } else if (tag.runtimeClass.equals(classOf[Long])) {
      createTypeInformation[Long].asInstanceOf[TypeInformation[K]]
    } else if (tag.runtimeClass.equals(classOf[String])) {
      createTypeInformation[String].asInstanceOf[TypeInformation[K]]
    } else {
      throw new IllegalArgumentException(s"index type $tag is not supported")
    }
  }
}
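
A minimal usage sketch of the engine API above, assuming a FlinkDistributedContext is available. The object name, the context construction and the small 2x2 matrix are illustrative, not taken from this file; only the FlinkEngine calls themselves come from the source above.

import org.apache.mahout.flinkbindings._
import org.apache.mahout.math.scalabindings._

object FlinkEngineUsageSketch {
  def main(args: Array[String]): Unit = {
    // Assumed to wrap a Flink ExecutionEnvironment; its construction is omitted here.
    implicit val ctx: FlinkDistributedContext = ???

    // Parallelize a small in-core matrix; row ordinals become the DRM keys.
    val inCore = dense((1.0, 2.0), (3.0, 4.0))
    val drmA = FlinkEngine.drmParallelizeWithRowIndices(inCore)

    // Column sums, column means and the (square-rooted) sum of squares,
    // as computed by the engine methods above.
    val sums = FlinkEngine.colSums(drmA)    // expected (4.0, 6.0)
    val means = FlinkEngine.colMeans(drmA)  // expected (2.0, 3.0)
    val nrm = FlinkEngine.norm(drmA)        // sqrt(1 + 4 + 9 + 16)

    println(s"colSums = $sums, colMeans = $means, norm = $nrm")
  }
}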




