io.github.seabow.DataX.scala

package io.github.seabow

import io.github.seabow.datax.common.{FutureUtils, ParameterUtils, SparkUtils}
import io.github.seabow.datax.core.Job
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession

import scala.io.Source

object DataX extends Logging {
  def main(args: Array[String]): Unit = {
    // Baseline Spark settings: Kryo serialization, the v2 file output committer,
    // dynamic partition overwrite, and cleanup of checkpoints on GC.
    val conf: SparkConf = new SparkConf()
      .set("spark.port.maxRetries", "30")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
      .set("spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored", "true")
      .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
      .set("spark.sql.broadcastTimeout", "3000")
      .set("spark.cleaner.referenceTracking.cleanCheckpoints", "true")
    if (!conf.contains("spark.cleaner.periodicGC.interval")) {
      log.warn("spark.cleaner.periodicGC.interval is not set, using 3min as default.")
      conf.set("spark.cleaner.periodicGC.interval", "3min")
    }
    // Parse command-line parameters.
    val params = ParameterUtils.getParameters(args)
    // Resolve the config file location; there is no default, so it is required.
    val configPath = params.get("config_path")
    if (configPath.isEmpty) {
      throw new IllegalArgumentException("No config_path specified")
    }

    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .enableHiveSupport()
      .getOrCreate()
    // Quiet the root logger so the warn-level banner below stands out.
    org.apache.log4j.LogManager.getRootLogger.setLevel(Level.WARN)
    val appId = spark.sparkContext.applicationId
    val uiWebUrl = spark.sparkContext.uiWebUrl
    log.warn(s"application id : $appId")
    log.warn(s"app web url : $uiWebUrl")
    // In client mode the config file is read from the driver's local filesystem;
    // otherwise it is fetched through SparkUtils (e.g. from a distributed filesystem).
    val configContent = spark.sparkContext.deployMode match {
      case "client" =>
        val source = Source.fromFile(configPath.get, "utf8")
        try source.mkString finally source.close() // avoid leaking the file handle
      case _ => SparkUtils.getFileContent(configPath.get)
    }
    val job = Job(configContent, params, spark)
    try {
      job.execute()
    } finally {
      job.close()
    }
  }
}
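Two helpers above come from io.github.seabow.datax.common and their implementations are not part of this listing. As a rough guide only, here is a minimal sketch of what ParameterUtils.getParameters could look like, assuming arguments arrive as key=value pairs; the real signature and parsing rules may differ:

// Hypothetical sketch, not the actual ParameterUtils from
// io.github.seabow.datax.common.
object ParameterUtilsSketch {
  // Parses e.g. Array("config_path=/jobs/sync.conf")
  // into Map("config_path" -> "/jobs/sync.conf").
  def getParameters(args: Array[String]): Map[String, String] =
    args.iterator
      .map(_.split("=", 2))                   // split on the first '=' only
      .collect { case Array(k, v) => k -> v } // drop malformed arguments
      .toMap
}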




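Similarly, when the driver runs in cluster mode it cannot assume config_path is on its local disk, which is why the listing routes the read through SparkUtils.getFileContent. A plausible sketch, assuming it wraps the Hadoop FileSystem API (the actual implementation may differ):

import java.io.InputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.io.Source

// Hypothetical sketch, not the actual SparkUtils: reads the whole file from
// whichever Hadoop-compatible filesystem the path points at (HDFS, S3A, ...).
object SparkUtilsSketch {
  def getFileContent(path: String): String = {
    val p = new Path(path)
    val fs = FileSystem.get(p.toUri, new Configuration())
    val in: InputStream = fs.open(p)
    try Source.fromInputStream(in, "UTF-8").mkString
    finally in.close()
  }
}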