bio.ferlab.datalake.spark3.elasticsearch.Indexer.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of datalake-spark3_2.12 Show documentation

Library built on top of Apache Spark to speed-up data lakes development..

There is a newer version: 14.8.0

package bio.ferlab.datalake.spark3.elasticsearch

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.elasticsearch.spark.sql._
import org.slf4j

/**
 *
 * @param spark instantiated spark session
 * {{{
 * val spark: SparkSession = SparkSession.builder
 *.config("es.index.auto.create", "true")
 *.config("es.nodes", "http://es_nodes_url")
 *.config("es.nodes.client.only", "false")
 *.config("es.nodes.discovery", "false")
 *.config("es.nodes.wan.only", "true")
 *.config("es.read.ignore_exception",  "true")
 *.config("es.port", "443")
 *.config("es.wan.only", "true")
 *.config("es.write.ignore_exception", "true")
 *
 *.config("spark.es.nodes.client.only", "false")
 *.config("spark.es.nodes.wan.only", "true")
 *.appName(s"Indexer")
 *.getOrCreate()
 * }}}
 *
 */
@deprecated
class Indexer(jobType: String,
              templateFilePath: String,
              currentIndex: String)
             (implicit val spark: SparkSession) {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)
  val log: slf4j.Logger = slf4j.LoggerFactory.getLogger(getClass.getCanonicalName)

  def run(df: DataFrame)(implicit esClient: ElasticSearchClient): Unit = {
    val ES_config = Map("es.write.operation" -> jobType)

    if (jobType == "index") setupIndex(currentIndex, templateFilePath)

    df.saveToEs(s"$currentIndex/_doc", ES_config)
  }

  def publish(alias: String,
              currentIndex: String,
              previousIndex: Option[String] = None)(implicit esClient: ElasticSearchClient): Unit = {
    esClient.setAlias(add = List(currentIndex), remove = previousIndex.toList, alias)
  }


  /**
   * Setup an index by checking that ES nodes are up, removing the old index and setting the template for this index.
   *
   * @param indexName    full index name
   * @param templatePath path of the template.json that is expected to be in the resource folder or spark
   * @param esClient     an instance of [[ElasticSearchClient]]
   */
  def setupIndex(indexName: String, templatePath: String)(implicit esClient: ElasticSearchClient): Unit = {
    log.info(s"ElasticSearch 'isRunning' status: [${esClient.isRunning}]")
    log.info(s"ElasticSearch 'checkNodes' status: [${esClient.checkNodeRoles}]")

    esClient.deleteIndex(indexName)
    esClient.setTemplate(templatePath)

  }
}