// com.mnubo.dbevolv.ElasticsearchDatabase.scala

package com.mnubo.dbevolv

import java.text.SimpleDateFormat
import java.util.Date

import com.mnubo.dbevolv.util.Logging
import com.mnubo.dbevolv.docker.{Container, Docker}
import com.typesafe.config.Config
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus
import org.elasticsearch.client.Client
import org.elasticsearch.client.transport.TransportClient
import org.elasticsearch.common.settings.ImmutableSettings
import org.elasticsearch.common.transport.InetSocketTransportAddress
import org.elasticsearch.index.query.QueryBuilders
import org.joda.time.{DateTime, DateTimeZone}
import spray.json._

import scala.collection.JavaConverters._
import scala.util.Try

object ElasticsearchDatabase extends Database {
  val name = "elasticsearch"
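  // ES 1.x logs this line once the gateway has finished recovering existing
  // indices; it is used below as the signal that the container has started.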
  private val isStartedRegex = """recovered \[\d+\] indices into cluster_state""".r

  override def openConnection(docker: Docker,
                              schemaName: String,
                              hosts: String,
                              port: Int,
                              userName: String,
                              pwd: String,
                              config: Config): DatabaseConnection =
    new ElasticsearchConnection(
      docker,
      schemaName,
      hosts,
      if (port > 0) port else 9300,
      config)

  override val testDockerBaseImage =
    DatabaseDockerImage(
      name              = "mnubo/elasticsearch:1.5.2",
      exposedPort       = 9300,
      isStarted         = isStarted
    )

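  // Builds a TransportClient against the given comma-separated host list.
  // ignore_cluster_name lets the client connect regardless of the cluster
  // name the (possibly ephemeral) test container advertises.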
  private[dbevolv] def newClient(hosts: String, port: Int) = {
    val settings =
      ImmutableSettings
        .builder()
        .put("client.transport.ignore_cluster_name", true)
        .classLoader(getClass.getClassLoader)
        .build()

    val addresses =
      hosts
        .split(",")
        .map(new InetSocketTransportAddress(_, port))

    new TransportClient(settings).addTransportAddresses(addresses: _*)
  }

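  // `using` is dbevolv's loan-pattern helper (presumably supplied by the
  // package object): it runs the block and closes the resource afterwards.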
  private def isStarted(log: String, container: Container) =
    isStartedRegex.findFirstIn(log).isDefined &&
    Try(using(newClient(container.containerHost, container.exposedPort)) { tempClient =>
      tempClient
        .admin()
        .cluster()
        .prepareHealth()
        .get
        .getStatus == ClusterHealthStatus.GREEN
    }).toOption.getOrElse(false)
}

class ElasticsearchConnection(docker: Docker,
                              computedDbName: String,
                              hosts: String,
                              port: Int,
                              config: Config) extends DatabaseConnection with Logging {
  private val client = ElasticsearchDatabase.newClient(hosts, port)
  private val forcePullVerificationDb = config.getBoolean("force_pull_verification_db")
  private val dockerNamespace = if (config.hasPath("docker_namespace")) Some(config.getString("docker_namespace")) else None
  private val schemaExtractorClass =
    if (config.hasPath("schema_extractor_class")) config.getString("schema_extractor_class")
    else classOf[DefaultElasticsearchSchemaExtractor].getName
  private val schemaExtractor = getClass
    .getClassLoader
    .loadClass(schemaExtractorClass)
    .getConstructor(classOf[Config])
    .newInstance(config)
    .asInstanceOf[ElasticsearchSchemaExtractor]

  private val schemaName: String = config.getString("schema_name")
  private val versionTypeName = s"${schemaName}_version"

  private val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZZ")

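  // Mutable on purpose: the active index is only known once setActiveSchema
  // has been called.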
  private var indexName: String = null

  override def setActiveSchema(indexName: String, config: Config): Unit = {
    this.indexName = indexName
    if (!indexExists) createIndex(config)
  }

  override def execute(smt: String): Unit =
    throw new Exception("The Elasticsearch database does not support SQL statements, just @@package.class scripts.")

  override def innerConnection: AnyRef =
    client

  /** For tests or QA, we might want to recreate a database instance from scratch. Implementors should know how to properly clean an existing database. */
  override def dropDatabase(config: Config) = {
    if (!client.admin.indices.prepareDelete(indexName).get.isAcknowledged)
      throw new Exception(s"Cannot delete index $indexName")

    createIndex(config)
  }

  override def getInstalledMigrationVersions: Set[InstalledVersion] = {
    ensureVersionTypeExists()

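    // A single search capped at size=10000 is assumed to cover every recorded
    // migration; real migration histories are far smaller in practice.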
    client
      .prepareSearch(indexName)
      .setTypes(versionTypeName)
      .setQuery(QueryBuilders.matchAllQuery())
      .setSize(10000)
      .execute()
      .actionGet()
      .getHits
      .getHits
      .map(doc => InstalledVersion(
        doc.getId,
        DateTime.parse(doc.getSource.get("migration_date").asInstanceOf[String]).withZone(DateTimeZone.UTC),
        doc.getSource.get("checksum").asInstanceOf[String]
      ))
      .toSet
  }

  override def markMigrationAsInstalled(migrationVersion: String, checksum: String, isRebase: Boolean) = {
    if (isRebase)
      client
        .prepareDeleteByQuery(indexName)
        .setTypes(versionTypeName)
        .setQuery(QueryBuilders.matchAllQuery())
        .get

    if (!client
      .prepareIndex(indexName, versionTypeName, migrationVersion)
      .setSource(
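        // the Any cast presumably steers Scala's overload resolution to the
        // setSource(Object...) field/value varargs form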
        "migration_date", df.format(new Date()).asInstanceOf[Any],
        "checksum", checksum)
      .get
      .isCreated)
      throw new Exception(s"Cannot mark migration $migrationVersion as installed")

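    // Force a flush so the freshly indexed version document is persisted
    // immediately (the same pattern is repeated after every version change).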
    client
      .admin
      .indices
      .prepareFlush(indexName)
      .setForce(true)
      .setWaitIfOngoing(true)
      .get
  }

  override def markMigrationAsUninstalled(migrationVersion: String) = {
    if (!client
      .prepareDelete(indexName, versionTypeName, migrationVersion)
      .get
      .isFound)
      throw new Exception(s"Cannot mark migration $migrationVersion as uninstalled")

    client
      .admin
      .indices
      .prepareFlush(indexName)
      .setForce(true)
      .setWaitIfOngoing(true)
      .get
  }

  override def updateChecksum(migrationVersion: String, newChecksum: String) = {
    client
      .prepareUpdate(indexName, versionTypeName, migrationVersion)
      .setDoc("checksum", newChecksum)
      .get()

    client
      .admin
      .indices
      .prepareFlush(indexName)
      .setForce(true)
      .setWaitIfOngoing(true)
      .get

    val updatedChecksum = getInstalledMigrationVersions.find(_.version == migrationVersion).get.checksum
    if (updatedChecksum != newChecksum)
      throw new Exception(s"Checksum of migration $migrationVersion hasn't been updated. $updatedChecksum != $newChecksum")
  }

  override def close() =
    Try(client.close())

  private def ensureVersionTypeExists() =
    if (!versionTypeExists) {
      if (!client
        .admin
        .indices
        .preparePutMapping(indexName)
        .setType(versionTypeName)
        .setSource(
          "migration_date", "type=date,store=true,format=date_time",
          "checksum",        "type=string,index=not_analyzed"
        )
        .get
        .isAcknowledged)
        throw new Exception(s"Cannot add mappings for version table in $indexName index.")
    }

  private def versionTypeExists =
    client
      .admin
      .indices
      .prepareTypesExists(indexName)
      .setTypes(versionTypeName)
      .get
      .isExists

  private def createIndex(config: Config) = {
    if (!client
      .admin
      .indices
      .prepareCreate(indexName)
      .setSettings(
        "number_of_shards", config.getString("shard_number"),
        "number_of_replicas", config.getString("replica_number")
      )
      .get
      .isAcknowledged)
      throw new Exception(s"Cannot create admin index $indexName.")

    ensureVersionTypeExists()
  }

  private def indexExists =
    client
      .admin
      .indices
      .prepareExists(indexName)
      .get
      .isExists

  override def isSchemaValid: Boolean = {
    val installed = getInstalledMigrationVersions.map(_.version).toSeq.sorted

    if (installed.isEmpty)
      true
    else {
      val currentVersion = installed.last

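      // testDockerImageName is presumably inherited from the Database trait:
      // it names the pre-built verification image for this schema at the
      // currently installed version.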
      val referenceDatabase = new Container(
        docker,
        ElasticsearchDatabase.testDockerImageName(dockerNamespace, computedDbName, currentVersion),
        ElasticsearchDatabase.testDockerBaseImage.isStarted,
        ElasticsearchDatabase.testDockerBaseImage.exposedPort,
        forcePull = forcePullVerificationDb
      )

      try {
        using(new ElasticsearchConnection(docker, computedDbName, referenceDatabase.containerHost, referenceDatabase.exposedPort, config)) { referenceDatabaseConnection =>
          referenceDatabaseConnection.setActiveSchema(schemaName, config)
          isSameSchema(referenceDatabaseConnection)
        }
      }
      finally {
        Try(referenceDatabase.stop())
        Try(referenceDatabase.remove())
      }
    }
  }

  override def isSameSchema(other: DatabaseConnection): Boolean = {
    other match {
      case otherConn: ElasticsearchConnection =>
        val mySchema = schema()
        val otherSchema = otherConn.schema()
        otherSchema.isCompatibleWith(mySchema)
      case _ => false
    }
  }

  private def schema(): Schema[Map[String, String]] = schemaExtractor.extractSchema(this, indexName)
}

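// Implement this trait and point the `schema_extractor_class` config key at
// your class (it must expose a single-Config constructor) to customize how
// the schema is read back from Elasticsearch.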
trait ElasticsearchSchemaExtractor extends SchemaExtractor[Map[String, String]]

class DefaultElasticsearchSchemaExtractor(config: Config) extends ElasticsearchSchemaExtractor with Logging {
  // Example mapping for an index with one type, "event":
  // {
  //     "event": {
  //         "properties": {
  //             "_all": {"enabled": "false" },
  //             "x_event_type": {"type": "string", "index": "not_analyzed"},
  //             "x_pipeline": {"type": "string", "index": "not_analyzed"},
  //             "x_timestamp": {"type": "date", "format": "date_time"},
  //             "x_object": {
  //                 "type": "nested",
  //                 "properties": {
  //                     "object_id": {"type": "string", "index": "not_analyzed"},
  //                     "x_registration_latlon": {"type": "geo_point"}
  //                 }
  //             }
  //         }
  //     }
  // }

  override def extractSchema(connection: DatabaseConnection, index: String): Schema[Map[String, String]] = {
    val client = connection.innerConnection.asInstanceOf[Client]
    val response = client
      .admin
      .indices
      .prepareGetMappings(index)
      .get

    Schema(
      response
        .mappings.asScala
        .head // Only one index
        .value
        .asScala
        .map { cursor =>
          val (typeName, typeMappings) = (cursor.key, cursor.value)
          Table(
            typeName,
            parseMappingProperties(
              typeMappings
                .source().string()
                .parseJson
                .asJsObject
                .fields(typeName)
                .asJsObject
            )
          )
        }
    )
  }

  // Returns all metadata associated with each property. Nested properties are
  // parsed as well; their name is their dotted path (e.g. x_object.x_owner.username).
  // Non-string metadata is ignored for simplicity.
  // Example of the JSON mapping that should be passed:
  // {
  //     "properties": {
  //         "_all": {"enabled": "false" },
  //         "x_event_type": {"type": "string", "index": "not_analyzed"},
  //         "x_pipeline": {"type": "string", "index": "not_analyzed"},
  //         "x_timestamp": {"type": "date", "format": "date_time"},
  //         "x_object": {
  //             "type": "nested",
  //             "properties": {
  //                 "object_id": {"type": "string", "index": "not_analyzed"},
  //                 "x_registration_latlon": {"type": "geo_point"}
  //             }
  //         }
  //     }
  // }
  private def parseMappingProperties(mapping: JsObject, prefix: String = ""): Set[Column[Map[String, String]]] =
    mapping
      .fields("properties")
      .asJsObject
      .fields
      .toSet[(String, JsValue)]
      .flatMap { case (name, v) =>
        val property = v.asJsObject
        val typ = property
          .fields("type")
          .asInstanceOf[JsString]
          .value

        log.debug(s"Mapping parser: found a $name property of type $typ.")
        if (typ == "nested")
          parseMappingProperties(property, prefix + name + ".")
        else {
          log.debug(s"Mapping parser: returning a $prefix$name property of type $typ.")
          Set(
            Column(
              prefix + name,
              property
                .fields
                .filter(_._2.isInstanceOf[JsString])
                .mapValues(_.asInstanceOf[JsString].value)
            )
          )
        }
      }
}
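
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original source). Assuming a `Docker` handle
// and a Typesafe `Config` carrying at least `schema_name`, `shard_number`,
// `replica_number` and `force_pull_verification_db`, a migration driver could
// exercise this connector roughly like this (`docker`, `config` and the
// literal values are placeholders):
//
//   val connection = ElasticsearchDatabase.openConnection(
//     docker, "analytics", "localhost", 9300, "", "", config)
//   try {
//     connection.setActiveSchema("analytics", config)
//     connection.markMigrationAsInstalled("0001", "abc123", isRebase = false)
//     connection.getInstalledMigrationVersions.foreach(println)
//   } finally connection.close()
// ---------------------------------------------------------------------------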
