// Source file: streaming.dsl.mmlib.algs.SQLImageLoaderExt.scala
// (newest version, from the streamingpro-mlsql-spark_2.4 artifact; Maven / Gradle / Ivy repository listing)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package streaming.dsl.mmlib.algs

import java.util.UUID

import com.intel.analytics.bigdl.dataset.Sample
import com.intel.analytics.bigdl.transform.vision.image._
import com.intel.analytics.bigdl.utils.Engine
import org.apache.spark.ml.param.{IntParam, Param}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.mlsql.session.MLSQLException
import streaming.common.{ScriptCacheKey, SourceCodeCompiler}
import streaming.dsl.ScriptSQLExec
import streaming.dsl.mmlib._
import streaming.dsl.mmlib.algs.param.BaseParams
import streaming.log.{Logging, WowLog}


/**
 * MLSQL algorithm (`ImageLoaderExt`) that reads a directory of images with BigDL's
 * `ImageFrame`, optionally runs a user-supplied `FeatureTransformer` pipeline over it,
 * and returns the result as a two-column DataFrame (`imageName`, `features`).
 *
 * Supported options (see the `Param` definitions at the bottom of the class):
 *  - `imageDir`: directory to read images from; falls back to the `path` argument of `train`.
 *  - `numOfImageTasks`: integer handed to `ImageFrame.read` as its third argument; defaults to 2.
 *  - `code`: Scala source defining an `apply` method that is invoked with the
 *    `Map[String, String]` options and whose result is used as a `FeatureTransformer`.
 *
 * @param uid unique identifier of this params-holder instance
 */
class SQLImageLoaderExt(override val uid: String) extends SQLAlg with BaseParams with Logging with WowLog {

  /** Creates an instance with a randomly generated uid. */
  def this() = this(BaseParams.randomUID())

  /**
   * Loads the images, applies the optional transformer pipeline and returns them as a DataFrame.
   *
   * @param df     input DataFrame; only its `sparkSession` is used
   * @param path   image directory used when the `imageDir` option is absent
   * @param params options from the where/options clause (`imageDir`, `numOfImageTasks`, `code`)
   * @return DataFrame with columns `imageName` (the image uri) and `features`
   *         (the data of the sample stored under `ImageFeature.sample`)
   * @throws IllegalArgumentException if neither `imageDir` nor `path` yields a non-empty directory
   * @throws NumberFormatException    if `numOfImageTasks` is not an integer
   * @throws MLSQLException           if the `code` snippet's `apply` does not return a FeatureTransformer
   */
  override def train(df: DataFrame, path: String, params: Map[String, String]): DataFrame = {
    Engine.init

    // An explicit `imageDir` option takes precedence over the path of the run statement.
    // The resolved value is validated in both cases and is the one actually read below.
    val resolvedDir = params.getOrElse(imageDir.name, path)
    require(resolvedDir != null && resolvedDir.nonEmpty, "imageDir should not empty")
    set(imageDir, resolvedDir)

    set(numOfImageTasks, params.get(numOfImageTasks.name).map(parseTaskCount).getOrElse(2))

    // Captured here (on the calling thread) so it can be re-installed while compiling user code.
    val c = ScriptSQLExec.contextGetOrForTest()

    val trans: Option[FeatureTransformer] = params.get(code.name).flatMap { userCode =>
      set(code, userCode)

      // Runs `fn` with the script context installed; failures are logged and rethrown.
      val wrap = (fn: () => Any) => {
        try {
          ScriptSQLExec.setContextIfNotPresent(c)
          fn()
        } catch {
          case e: Exception =>
            logError(format_exception(e))
            throw e
        }
      }

      // Wrap the user snippet in a uniquely named class that pre-imports the BigDL image DSL.
      val className = s"StreamingProUDF_${UUID.randomUUID().toString.replaceAll("-", "")}"
      val newfun =
        s"""
           |class  ${className}{
           |import com.intel.analytics.bigdl.transform.vision.image._
           |import com.intel.analytics.bigdl.transform.vision.image.augmentation._
           |import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
           |  ${$(code)}
           |}
            """.stripMargin

      val clazz = wrap(() => {
        SourceCodeCompiler.execute(ScriptCacheKey(newfun, className))
      }).asInstanceOf[Class[_]]

      val method = SourceCodeCompiler.getMethod(clazz, "apply")
      // Match instead of a blind cast so a wrong return type gives an actionable error.
      method.invoke(clazz.newInstance(), params) match {
        case transformer: FeatureTransformer => Some(transformer)
        case null => None
        case other =>
          throw new MLSQLException(
            s"apply in ${code.name} should return a FeatureTransformer, but got ${other.getClass.getName}")
      }
    }

    val session = df.sparkSession
    val distributedImageFrame = ImageFrame.read($(imageDir), session.sparkContext, $(numOfImageTasks))
    val imageFrame = trans.map(tr => tr(distributedImageFrame)).getOrElse(distributedImageFrame)

    val imageRDD = imageFrame.toDistributed().rdd.map { im =>
      (im.uri, im[Sample[Float]](ImageFeature.sample).getData())
    }
    session.createDataFrame(imageRDD).toDF("imageName", "features")
  }

  /**
   * Parses the `numOfImageTasks` option.
   *
   * @param raw option value as written in the script
   * @return the parsed integer
   * @throws NumberFormatException naming the option and the offending value
   */
  private def parseTaskCount(raw: String): Int =
    try raw.toInt catch {
      case _: NumberFormatException =>
        throw new NumberFormatException(
          s"${numOfImageTasks.name} should be an integer, but got: $raw")
    }

  /** Batch prediction is the same operation as [[train]]: load (and transform) the images. */
  override def batchPredict(df: DataFrame, path: String, params: Map[String, String]): DataFrame = {
    train(df, path, params)
  }

  /** Returns the parameter descriptions produced by `_explainParams`. */
  override def explainParams(sparkSession: SparkSession): DataFrame = {
    _explainParams(sparkSession)
  }

  override def modelType: ModelType = ProcessType

  /** Markdown documentation of this module. */
  override def doc: Doc = Doc(MarkDownDoc,
    """
      |ImageLoaderExt module is used to process images.
      |
      |Check available params:
      |
      |```sql
      |load modelParams.`ImageLoaderExt` as output;
      |```
      |
      |Check example:
      |
      |```
      |load modelExample.`ImageLoaderExt` as output;
      |```
      |
      |The `code` param is used to configure image processing pipeline.
      |MLSQL provide a DSL which is supported by BigDL.
      |
      |For example, if you define a processing pipeline like this(check example)
      |
      |```
      |code='''
      |        def apply(params:Map[String,String]) = {
      |         Resize(256, 256) -> CenterCrop(224, 224) ->
      |          MatToTensor() -> ImageFrameToSample()
      |       }
      |```
      |
      |The parameter of params in apply function is contains all expressions in where/options statement.
      |
      |```
      |Resize(256, 256) -> CenterCrop(224, 224) ->
      |          MatToTensor() -> ImageFrameToSample()
      |```
      |
      |This means first step, resize the image to 256*256 ,and then crop the image in center ,change the image
      |to tensor, finally convert to ImageFrame which is a collection of sample.
      |
      |ImageFrame is a data collections  contains two columns,they are `imageName` and `features`.
      |The imageName is the url of image.
      |
      |More details about FeatureTransformer something like Resize,CenterCrop please check
      |[URL](https://github.com/intel-analytics/BigDL/blob/master/docs/docs/APIGuide/Transformer.md)
    """.stripMargin)

  /** Example MLSQL script that uses this module. */
  override def codeExample: Code = Code(SQLCode,
    """
      |set json='''{}''';
      |load jsonStr.`json` as emptyData;
      |
      |set imageDir="/Users/allwefantasy/Downloads/jack";
      |
      |run emptyData as ImageLoaderExt.`${imageDir}`
      |where code='''
      |        def apply(params:Map[String,String]) = {
      |         Resize(256, 256) -> CenterCrop(224, 224) ->
      |          MatToTensor() -> ImageFrameToSample()
      |       }
      |''' as images;
      |select imageName from images limit 1 as output;
    """.stripMargin)

  /** Always throws: this module produces no model to load. */
  override def load(sparkSession: SparkSession, path: String, params: Map[String, String]): Any = {
    throw new MLSQLException(s"register is not support in ${getClass.getName}")
  }

  /** This module provides no UDF; returns `null`, as before, so callers see unchanged behavior. */
  override def predict(sparkSession: SparkSession, _model: Any, name: String, params: Map[String, String]): UserDefinedFunction = {
    null
  }

  final val imageDir: Param[String] = new Param[String](this, "imageDir", "imageDir directory")
  final val numOfImageTasks: IntParam = new IntParam(this, "numOfImageTasks", "how many")
  final val code: Param[String] = new Param[String](this, "code", "code")

}