All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ebiznext.comet.config.DatasetArea.scala Maven / Gradle / Ivy

There is a newer version: 0.2.6
Show newest version
/*
 *
 *  * Licensed to the Apache Software Foundation (ASF) under one or more
 *  * contributor license agreements.  See the NOTICE file distributed with
 *  * this work for additional information regarding copyright ownership.
 *  * The ASF licenses this file to You under the Apache License, Version 2.0
 *  * (the "License"); you may not use this file except in compliance with
 *  * the License.  You may obtain a copy of the License at
 *  *
 *  *    http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS,
 *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  * See the License for the specific language governing permissions and
 *  * limitations under the License.
 *
 *
 */

package com.ebiznext.comet.config

import java.util.Locale

import com.ebiznext.comet.schema.handlers.StorageHandler
import com.fasterxml.jackson.core.{JsonGenerator, JsonParser}
import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize}
import com.fasterxml.jackson.databind.{
  DeserializationContext,
  JsonDeserializer,
  JsonSerializer,
  SerializerProvider
}
import org.apache.hadoop.fs.Path

/** Utility methods to reference dataset paths.
  * Dataset paths are constructed as follows:
  *   - root path defined by the COMET_DATASETS env var or datasets application property
  *   - followed by the area name
  *   - followed by the domain name
  */
object DatasetArea {

  /** Builds the folder of a domain inside an area, i.e. `<datasets root>/<area>/<domain>`
    * where the datasets root is read from `settings.comet.datasets`.
    *
    * @param domain : Domain name
    * @param area   : Area name
    * @return Path to the folder of the domain in the given area
    */
  def path(domain: String, area: String)(implicit settings: Settings): Path =
    new Path(s"${settings.comet.datasets}/$area/$domain")

  /** Resolves a schema name as a child of a domain folder.
    *
    * @param domainPath : Folder of the domain
    * @param schema     : Schema name, used as the child path element
    * @return Path made of domainPath followed by schema
    */
  def path(domainPath: Path, schema: String): Path =
    new Path(domainPath, schema)

  /** Datasets waiting to be ingested are stored here.
    *
    * @param domain : Domain name
    * @return Path to the pending folder of domain
    */
  def pending(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.pending)

  /** Datasets with a file name that could not match any schema file name pattern in the
    * specified domain are marked unresolved by being stored in this folder.
    *
    * @param domain : Domain name
    * @return Path to the unresolved folder of domain
    */
  def unresolved(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.unresolved)

  /** Once ingested, datasets are archived in this folder.
    *
    * @param domain : Domain name
    * @return Path to the archive folder of domain
    */
  def archive(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.archive)

  /** Datasets of the specified domain currently being ingested are located in this folder.
    *
    * @param domain : Domain name
    * @return Path to the ingesting folder of domain
    */
  def ingesting(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.ingesting)

  /** Valid records for the datasets of the specified domain are stored in this folder.
    *
    * @param domain : Domain name
    * @return Path to the accepted folder of domain
    */
  def accepted(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.accepted)

  /** Invalid records and the reason why they have been rejected for the datasets of the
    * specified domain are stored in this folder.
    *
    * @param domain : Domain name
    * @return Path to the rejected folder of domain
    */
  def rejected(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.rejected)

  /** Folder holding the metrics of a schema. It is obtained by substituting the `{domain}`
    * and `{schema}` placeholders found in `settings.comet.metrics.path`.
    *
    * @param domain : Domain name
    * @param schema : Schema name
    * @return Path to the metrics folder of the schema
    */
  def metrics(domain: String, schema: String)(implicit settings: Settings): Path = {
    // Named `template` rather than `path` so that it does not shadow the path(...) methods.
    val template = settings.comet.metrics.path
    new Path(
      template
        .replace("{domain}", domain)
        .replace("{schema}", schema)
    )
  }

  /** @param domain : Domain name
    * @param schema : Schema name
    * @return The "discrete" sub-folder of the metrics folder of the schema
    */
  def discreteMetrics(domain: String, schema: String)(implicit settings: Settings): Path =
    new Path(metrics(domain, schema), "discrete")

  /** @param domain : Domain name
    * @param schema : Schema name
    * @return The "continuous" sub-folder of the metrics folder of the schema
    */
  def continuousMetrics(domain: String, schema: String)(implicit settings: Settings): Path =
    new Path(metrics(domain, schema), "continuous")

  /** @param domain : Domain name
    * @param schema : Schema name
    * @return The "frequencies" sub-folder of the metrics folder of the schema
    */
  def frequenciesMetrics(domain: String, schema: String)(implicit settings: Settings): Path =
    new Path(metrics(domain, schema), "frequencies")

  /** Default target folder for autojobs applied to datasets in this domain.
    *
    * @param domain : Domain name
    * @return Path to the business folder of domain
    */
  def business(domain: String)(implicit settings: Settings): Path =
    path(domain, settings.comet.area.business)

  /** @return Root folder of the metadata, as defined by `settings.comet.metadata`
    */
  def metadata(implicit settings: Settings): Path =
    new Path(s"${settings.comet.metadata}")

  /** @return The "types" sub-folder of the metadata folder
    */
  def types(implicit settings: Settings): Path =
    new Path(metadata, "types")

  /** @return The "mapping" sub-folder of the metadata folder
    */
  def mapping(implicit settings: Settings): Path =
    new Path(metadata, "mapping")

  /** @return The "domains" sub-folder of the metadata folder
    */
  def domains(implicit settings: Settings): Path =
    new Path(metadata, "domains")

  /** @return The "jobs" sub-folder of the metadata folder
    */
  def jobs(implicit settings: Settings): Path =
    new Path(metadata, "jobs")

  /** Creates the metadata, types and domains folders.
    * The mapping and jobs folders are not created here.
    *
    * @param storage : Storage handler used to create the folders
    */
  def initMetadata(
    storage: StorageHandler
  )(implicit settings: Settings): Unit =
    List(metadata, types, domains).foreach(storage.mkdirs)

  /** Creates, for each domain, the pending, unresolved, archive, accepted, rejected and
    * business folders. The ingesting folder is not created here.
    *
    * @param storage : Storage handler used to create the folders
    * @param domains : Names of the domains to initialize
    */
  def initDomains(storage: StorageHandler, domains: Iterable[String])(implicit
    settings: Settings
  ): Unit =
    domains.foreach { domain =>
      List(
        pending(domain),
        unresolved(domain),
        archive(domain),
        accepted(domain),
        rejected(domain),
        business(domain)
      ).foreach(storage.mkdirs)
    }
}

/** After going through the data pipeline
  * a dataset may be referenced through a Hive table in a Hive database.
  * For each input domain, the following Hive databases may be created:
  *     - The rejected database : contains tables referencing rejected records for each schema in the domain
  *     - The accepted database : contains tables referencing accepted records for each schema in the domain
  *     - The business database : contains tables where autojob tables are created by default
  *     - The custom database : contains tables where autojob tables are created when a specific area is defined
  */
object StorageArea {

  /** Area of the rejected records. */
  case object rejected extends StorageArea {
    def value: String = "rejected"
  }

  /** Area of the accepted records. */
  case object accepted extends StorageArea {
    def value: String = "accepted"
  }

  /** Default area of the autojobs. */
  case object business extends StorageArea {
    def value: String = "business"
  }

  /** Any area that is not one of the predefined ones.
    *
    * @param value : Name of the area
    */
  final case class Custom(value: String) extends StorageArea

  /** Resolves an area name into a [[StorageArea]].
    * The name is lower-cased (root locale) and compared with the rejectedFinal,
    * acceptedFinal and businessFinal names found in `settings.comet.area`, in that order.
    * When none of them matches, the lower-cased name is wrapped into a [[Custom]] area.
    *
    * @param value : Area name, in any case
    * @return The matching predefined area, or a custom one
    */
  def fromString(value: String)(implicit settings: Settings): StorageArea = {
    val normalized = value.toLowerCase(Locale.ROOT)

    if (normalized == settings.comet.area.rejectedFinal) rejected
    else if (normalized == settings.comet.area.acceptedFinal) accepted
    else if (normalized == settings.comet.area.businessFinal) business
    else Custom(normalized)
  }

  /** Builds the name `<domain>_<area value>`.
    *
    * @param domain : Domain name
    * @param area   : Storage area whose value is used as suffix
    * @return Domain name and area value joined by an underscore
    */
  def area(domain: String, area: StorageArea): String = {
    val suffix = area.value
    s"${domain}_$suffix"
  }

}

/** Jackson serializer writing a [[StorageArea]] as a JSON string.
  * The predefined areas are written using the names found in `settings.comet.area`,
  * a custom area is written using its own value.
  * The [[Settings]] instance is looked up as an attribute of the serializer provider,
  * under the key `classOf[Settings]`.
  */
final class StorageAreaSerializer extends JsonSerializer[StorageArea] {

  /** Name under which an area is written, according to the given settings. */
  private def configuredName(area: StorageArea, settings: Settings): String =
    area match {
      case StorageArea.Custom(customValue) => customValue
      case StorageArea.accepted            => settings.comet.area.accepted
      case StorageArea.rejected            => settings.comet.area.rejected
      case StorageArea.business            => settings.comet.area.business
    }

  /** @param value       : Area to serialize
    * @param gen         : Generator the area name is written to
    * @param serializers : Provider expected to carry the Settings attribute;
    *                      an IllegalArgumentException is thrown by `require` when it is missing
    */
  override def serialize(
    value: StorageArea,
    gen: JsonGenerator,
    serializers: SerializerProvider
  ): Unit = {
    val attribute = serializers.getAttribute(classOf[Settings])
    val settings = attribute.asInstanceOf[Settings]
    require(settings != null, "the SerializationContext lacks a Settings instance")

    gen.writeString(configuredName(value, settings))
  }
}

/** Jackson deserializer reading a [[StorageArea]] from a JSON string.
  * The [[Settings]] instance is looked up as an injectable value of the deserialization
  * context, under the id "com.ebiznext.comet.config.Settings".
  */
final class StorageAreaDeserializer extends JsonDeserializer[StorageArea] {

  /** @param jp  : Parser positioned on the area name
    * @param ctx : Context expected to provide the injectable Settings;
    *              an IllegalArgumentException is thrown by `require` when the lookup yields null
    * @return The area resolved by `StorageArea.fromString`
    */
  override def deserialize(jp: JsonParser, ctx: DeserializationContext): StorageArea = {
    val injected = ctx.findInjectableValue("com.ebiznext.comet.config.Settings", null, null)
    implicit val settings: Settings = injected.asInstanceOf[Settings]
    require(settings != null, "the DeserializationContext lacks a Settings instance")

    val areaName = jp.readValueAs[String](classOf[String])
    StorageArea.fromString(areaName)
  }
}

/** Base type of the storage areas.
  * Instances are converted to and from JSON by [[StorageAreaSerializer]] and
  * [[StorageAreaDeserializer]].
  */
@JsonDeserialize(using = classOf[StorageAreaDeserializer])
@JsonSerialize(using = classOf[StorageAreaSerializer])
sealed abstract class StorageArea {

  /** Name of the area. */
  def value: String

  /** The string form of an area is its value. */
  override def toString: String = {
    value
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy