streaming.core.compositor.spark.transformation.ScriptCompositor.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of streamingpro-mlsql-spark_2.4 Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package streaming.core.compositor.spark.transformation

import java.util

import net.liftweb.{json => SJSon}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.util.{ScalaSourceCodeCompiler, ScriptCacheKey}

import scala.collection.JavaConversions._

/**
  * 8/2/16 WilliamZhu([email protected])
  */
class ScriptCompositor[T] extends BaseDFCompositor[T] {


  def scripts = {
    _configParams.get(1).map { fieldAndCode =>
      (fieldAndCode._1.toString, fieldAndCode._2 match {
        case a: util.List[String] => a.mkString(" ")
        case a: String => a
        case _ => ""
      })
    }
  }

  def schema = {
    config[String]("schema", _configParams)
  }

  def script = {
    config[String]("script", _configParams)
  }


  val LANGSET = Set("scala", "python")


  def source = {
    config[String]("source", _configParams)
  }

  def lang = {
    config[String]("lang", _configParams)
  }

  def useDocMap = {
    config[Boolean]("useDocMap", _configParams)
  }

  def ignoreOldColumns = {
    config[Boolean]("ignoreOldColumns", _configParams)
  }

  def samplingRatio = {
    config[Double]("samplingRatio", _configParams)
  }


  def createDF(params: util.Map[Any, Any], df: DataFrame) = {
    val _script = script.getOrElse("")
    val _schema = schema.getOrElse("")
    val _source = source.getOrElse("")
    val _useDocMap = useDocMap.getOrElse(false)
    val _ignoreOldColumns = ignoreOldColumns.getOrElse(false)
    val _samplingRatio = samplingRatio.getOrElse(1.0)


    def loadScriptFromFile(script: String) = {
      if ("file" == _source || script.startsWith("file:/") || script.startsWith("hdfs:/")) {
        df.sqlContext.sparkContext.textFile(script).collect().mkString("\n")
      } else if (script.startsWith("classpath:/")) {
        val cleanScriptFilePath = script.substring("classpath://".length)
        scala.io.Source.fromInputStream(
          this.getClass.getResourceAsStream(cleanScriptFilePath)).getLines().
          mkString("\n")
      }
      else script
    }


    val _maps = if (_script.isEmpty) {
      scripts.map { fieldAndCode =>
        (fieldAndCode._1, loadScriptFromFile(fieldAndCode._2))
      }
    } else {
      Map("anykey" -> _script)
    }




    val rawRdd = df.rdd.map { row =>
      val maps = _maps.flatMap { f =>
        val executor = ScalaSourceCodeCompiler.execute(ScriptCacheKey(if (_useDocMap) "doc" else "rawLine", f._2))
        if (_useDocMap) {
          executor.execute(row.getValuesMap(row.schema.fieldNames))
        } else {
          executor.execute(row.getAs[String](f._1))
        }
      }

      val newMaps = if (_ignoreOldColumns) maps
      else row.schema.fieldNames.map(f => (f, row.getAs(f))).toMap ++ maps

      newMaps
    }



    if (schema != None) {

      val schemaScript = loadScriptFromFile(_schema)
      val rowRdd = rawRdd.map { newMaps =>
        val schemaExecutor = ScalaSourceCodeCompiler.execute(ScriptCacheKey("schema", schemaScript))
        val st = schemaExecutor.schema().get
        Row.fromSeq(st.fieldNames.map { fn =>
          if (newMaps.containsKey(fn)) newMaps(fn) else null
        }.toSeq)
      }
      val schemaExecutor = ScalaSourceCodeCompiler.execute(ScriptCacheKey("schema", schemaScript))
      df.sqlContext.createDataFrame(rowRdd, schemaExecutor.schema().get)
    } else {

      val jsonRdd = rawRdd.map { newMaps =>
        implicit val formats = SJSon.Serialization.formats(SJSon.NoTypeHints)
        SJSon.Serialization.write(newMaps)
      }
      df.sqlContext.read.option("samplingRatio", _samplingRatio + "").json(jsonRdd)
    }


  }

}