/*
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*
*
*/
package com.ebiznext.comet.job
import buildinfo.BuildInfo
import com.ebiznext.comet.config.{DatasetArea, Settings}
import com.ebiznext.comet.extractor.ScriptGen
import com.ebiznext.comet.job.atlas.AtlasConfig
import com.ebiznext.comet.job.convert.{Parquet2CSV, Parquet2CSVConfig}
import com.ebiznext.comet.job.index.bqload.BigQueryLoadConfig
import com.ebiznext.comet.job.index.connectionload.ConnectionLoadConfig
import com.ebiznext.comet.job.index.esload.ESLoadConfig
import com.ebiznext.comet.job.infer.InferSchemaConfig
import com.ebiznext.comet.job.ingest.LoadConfig
import com.ebiznext.comet.job.metrics.MetricsConfig
import com.ebiznext.comet.schema.generator.Xls2Yml
import com.ebiznext.comet.schema.handlers.SchemaHandler
import com.ebiznext.comet.utils.{CometObjectMapper, FileLock}
import com.ebiznext.comet.workflow.{IngestionWorkflow, TransformConfig, WatchConfig}
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory
import com.typesafe.config.ConfigFactory
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.Path
/** The root of all things:
  * - importing from landing
  * - submitting requests to the cron manager
  * - ingesting the datasets
  * - running an auto job
  * All these things are launched from here.
  * See printUsage below to understand the CLI syntax.
  */
object Main extends StrictLogging {
  // uses Jackson YAML for parsing; relies on SnakeYAML for low-level handling
val mapper: ObjectMapper = new CometObjectMapper(new YAMLFactory())
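  // Illustrative use only (not invoked in this file): being a regular Jackson
  // ObjectMapper, it can parse raw YAML, e.g. mapper.readTree("version: 1").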
private def printUsage() = {
// scalastyle:off println
println(
s"""
         |Usage: one of
|comet job jobname
|comet watch [+/-DOMAIN1,DOMAIN2,...]
|comet import
|${LoadConfig.usage()}
|${ESLoadConfig.usage()}
|${BigQueryLoadConfig.usage()}
|${InferSchemaConfig.usage()}
|${MetricsConfig.usage()}
|${Parquet2CSVConfig.usage()}
| """.stripMargin
)
// scalastyle:on println
}
  /** @param args depends on the action required.
    * To run a job:
    * - call "comet job jobname" where jobname is the name of the job
    *   as defined in one of the definition files present in the metadata/jobs folder.
    * To import files from a local file system:
    * - call "comet import"; this will move files in the landing area to the pending area.
    * To watch for files waiting to be processed:
    * - call "comet watch [{+|-}domain1,domain2,domain3]" with an optional domain list separated by a ','.
    *   When called without any domain, it will watch all domain folders in the landing area.
    *   When called with a '+' sign, it will look only for the listed domain folders in the landing area.
    *   When called with a '-' sign, it will look for all domain folders in the landing area except the ones listed on the command line.
    * To ingest a file defined by its schema in the specified domain:
    * - call "comet ingest domain schema hdfs://datasets/domain/pending/file.dsv"
    * To infer the schema of a dataset:
    * - call "comet infer-schema --domain domainName --schema schemaName --input datasetpath --output outputPath --with-header boolean"
    * To compute all metrics on a specific schema in a specific domain:
    * - call "comet metrics --domain domain-name --schema schema-name"
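    *
    * Example invocations (the domain, schema and path below are hypothetical):
    * {{{
    * comet import
    * comet watch +sales,hr
    * comet ingest sales orders hdfs://datasets/sales/pending/orders.dsv
    * comet metrics --domain sales --schema orders
    * }}}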
*/
def main(args: Array[String]): Unit = {
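    // Settings come from the standard Typesafe Config stack
    // (system properties, application.conf, reference.conf).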
implicit val settings: Settings = Settings(ConfigFactory.load())
settings.publishMDCData()
logger.info(s"Comet Version ${BuildInfo.version}")
import settings.{launcherService, metadataStorageHandler, storageHandler}
DatasetArea.initMetadata(metadataStorageHandler)
val schemaHandler = new SchemaHandler(metadataStorageHandler)
DatasetArea.initDomains(storageHandler, schemaHandler.domains.map(_.name))
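    // The workflow drives every action handled below: import, watch, ingest, load, etc.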
val workflow =
new IngestionWorkflow(storageHandler, schemaHandler, launcherService)
    // arglist.head below would throw on an empty argument list, so print usage and exit here.
    if (args.length == 0) {
      printUsage()
      System.exit(1)
    }
    val arglist = args.toList
logger.info(s"Running Comet $arglist")
val result = arglist.head match {
case "job" | "transform" =>
TransformConfig.parse(args.drop(1)) match {
case Some(config) =>
            // run the transform/auto job described by the parsed config
workflow.autoJob(config)
case _ =>
println(TransformConfig.usage())
false
}
case "import" =>
workflow.loadLanding()
true
case "watch" =>
WatchConfig.parse(args.drop(1)) match {
case Some(config) =>
workflow.loadPending(config)
case _ =>
println(WatchConfig.usage())
false
}
case "ingest" | "load" =>
LoadConfig.parse(args.drop(1)) match {
case Some(config) =>
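            // Serialize ingestion per (domain, schema): take a file lock so that two
            // ingestions of the same schema cannot run concurrently.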
val lockPath =
new Path(settings.comet.lock.path, s"${config.domain}_${config.schema}.lock")
val locker = new FileLock(lockPath, storageHandler)
val waitTimeMillis = settings.comet.lock.ingestionTimeout
locker.doExclusively(waitTimeMillis) {
workflow.ingest(config)
}
case _ =>
println(LoadConfig.usage())
false
}
case "index" | "esload" =>
ESLoadConfig.parse(args.drop(1)) match {
case Some(config) =>
            // index the ingested dataset into Elasticsearch
workflow.esLoad(config).isSuccess
case _ =>
println(ESLoadConfig.usage())
false
}
case "atlas" =>
AtlasConfig.parse(args.drop(1)) match {
case Some(config) =>
            // run the Apache Atlas integration with the parsed config
workflow.atlas(config)
case _ =>
println(AtlasConfig.usage())
false
}
case "bqload" =>
BigQueryLoadConfig.parse(args.drop(1)) match {
case Some(config) =>
workflow.bqload(config).isSuccess
case _ =>
println(BigQueryLoadConfig.usage())
false
}
case "cnxload" =>
ConnectionLoadConfig.parse(args.drop(1)) match {
case Some(config) =>
workflow.jdbcload(config).isSuccess
case _ =>
println(ConnectionLoadConfig.usage())
false
}
case "infer-schema" => {
InferSchemaConfig.parse(args.drop(1)) match {
case Some(config) =>
workflow.infer(config).isSuccess
case _ =>
println(InferSchemaConfig.usage())
false
}
}
case "metrics" =>
MetricsConfig.parse(args.drop(1)) match {
case Some(config) =>
workflow.metric(config).isSuccess
case _ =>
println(MetricsConfig.usage())
false
}
case "parquet2csv" =>
Parquet2CSVConfig.parse(args.drop(1)) match {
case Some(config) =>
new Parquet2CSV(config, storageHandler).run().isSuccess
case _ =>
println(Parquet2CSVConfig.usage())
false
}
case "xls2yml" =>
Xls2Yml.run(args.drop(1))
case "extract" =>
ScriptGen.run(args.drop(1))
case _ =>
printUsage()
false
}
System.exit(if (result) 0 else 1)
}
}