/*
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*
*
*/
package com.ebiznext.comet.job.index.esload

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model.Schema
import com.ebiznext.comet.utils.{JobResult, SparkJob, SparkJobResult}
import com.softwaremill.sttp._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.types.StructField
import scala.util.{Failure, Success, Try}
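
/** Load a dataset into Elasticsearch and register its index template.
  *
  * @param cliConfig      ES load configuration: target resource, dataset, format, mapping, ...
  * @param storageHandler used to read an explicit mapping file from storage
  * @param schemaHandler  provides the domain / schema definitions used to build the mapping
  */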
class ESLoadJob(
  cliConfig: ESLoadConfig,
  storageHandler: StorageHandler,
  schemaHandler: SchemaHandler
)(implicit val settings: Settings)
    extends SparkJob {
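
  // Options passed to the es-hadoop connector: the resource to write to, an
  // optional document id field, plus any extra settings supplied on the CLI.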
  val esResource = Some(("es.resource.write", cliConfig.getResource()))
  val esId = cliConfig.id.map("es.mapping.id" -> _)
  val esCliConf = cliConfig.conf ++ List(esResource, esId).flatten.toMap
  val path = cliConfig.getDataset()
  val format = cliConfig.format
  val dataset = cliConfig.dataset

  override def name: String = s"Index $path"
  /** Entry point of the job: loads the dataset, registers the index template
    * and writes the data to Elasticsearch.
    *
    * @return Success(SparkJobResult) when the template was registered and the
    *         data indexed, Failure otherwise
    */
  override def run(): Try[JobResult] = {
    logger.info(s"Indexing resource ${cliConfig.getResource()} with $cliConfig")
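
    // The input is either a path on the storage (Left), loaded according to
    // the configured format, or a DataFrame computed upstream (Right).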
    val inputDF =
      path match {
        case Left(path) =>
          format match {
            case "json" =>
              session.read
                .option("multiline", true)
                .json(path.toString)
            case "json-array" =>
              val jsonDS = session.read.textFile(path.toString)
              session.read.json(jsonDS)
            case "parquet" =>
              session.read.parquet(path.toString)
            case other =>
              // Fail fast with an explicit error instead of a raw MatchError.
              throw new Exception(s"Unsupported format $other")
          }
        case Right(df) =>
          df
      }

    // Convert the timestamp field to an ISO 8601 datetime so that es-hadoop can handle it correctly.
    val df = cliConfig.getTimestampCol().map { tsCol =>
      import org.apache.spark.sql.functions._
      inputDF
        .withColumn("comet_es_tmp", date_format(col(tsCol), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
        .drop(tsCol)
        .withColumnRenamed("comet_es_tmp", tsCol)
    } getOrElse inputDF
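
    // Resolve the mapping template: an explicit mapping file when provided,
    // otherwise a template built from the YAML schema, otherwise a default one
    // inferred from the DataFrame schema.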
    val content = cliConfig.mapping.map(storageHandler.read).getOrElse {
      val dynamicTemplate = for {
        domain <- schemaHandler.getDomain(cliConfig.domain)
        schema <- domain.schemas.find(_.name == cliConfig.schema)
      } yield schema.mapping(domain.mapping(schema), domain.name, schemaHandler)
      dynamicTemplate.getOrElse {
        // Handle datasets without a YAML schema.
        // We only handle index names of the form idx-{...}
        Schema.mapping(
          cliConfig.domain,
          cliConfig.schema,
          StructField("ignore", df.schema),
          schemaHandler
        )
      }
    }

    logger.info(
      s"Registering template ${cliConfig.domain.toLowerCase}_${cliConfig.schema.toLowerCase} -> $content"
    )
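
    // Read the Elasticsearch cluster coordinates from the application settings;
    // they are used to register the template through the cluster REST API.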
    import scala.collection.JavaConverters._
    val esOptions = settings.comet.elasticsearch.options.asScala.toMap
    val host: String = esOptions.getOrElse("es.nodes", "localhost")
    val port = esOptions.getOrElse("es.port", "9200").toInt
    val ssl = esOptions.getOrElse("es.net.ssl", "false").toBoolean
    val protocol = if (ssl) "https" else "http"
    val username = esOptions.get("net.http.auth.user")
    val password = esOptions.get("net.http.auth.password")
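
    // Basic authentication is applied only when both a username and a password are configured.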
    implicit val backend = HttpURLConnectionBackend()
    val authSttp = for {
      u <- username
      p <- password
    } yield {
      sttp.auth.basic(u, p)
    }
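
    // Re-register the template idempotently: delete any existing template with
    // the same name, then PUT the new definition.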
    val templateUri =
      uri"$protocol://$host:$port/_template/${cliConfig.getIndexName()}"
    val requestDel = authSttp
      .getOrElse(sttp)
      .delete(templateUri)
      .contentType("application/json")
    val _ = requestDel.send()

    val requestPut = authSttp
      .getOrElse(sttp)
      .body(content)
      .put(templateUri)
      .contentType("application/json")
    val responsePut = requestPut.send()
    val ok = (200 to 299) contains responsePut.code
    if (ok) {
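      // Template registered: push the DataFrame to Elasticsearch through the
      // es-hadoop Spark connector, combining global and CLI-provided options.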
      val allConf = esOptions.toList ++ esCliConf.toList
      logger.whenDebugEnabled {
        logger.debug(s"sending ${df.count()} documents to Elasticsearch using $allConf")
      }
      val writer = allConf
        .foldLeft(df.write)((w, kv) => w.option(kv._1, kv._2))
        .format("org.elasticsearch.spark.sql")
        .mode(SaveMode.Overwrite)
      if (settings.comet.isElasticsearchSupported())
        writer.save(cliConfig.getResource())
      Success(SparkJobResult(None))
    } else {
      // Do not throw inside Failure(...): wrap the exception so callers get a Failure.
      Failure(new Exception("Failed to create template"))
    }
  }
}