package io.github.seabow
import io.github.seabow.datax.common.{FutureUtils, ParameterUtils, SparkUtils}
import io.github.seabow.datax.core.Job
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import scala.io.Source
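
/**
 * Spark entry point for DataX: parses command-line parameters, builds a SparkSession
 * with Hive support, loads the job configuration file and runs the configured Job.
 *
 * A sketch of a typical launch; the jar name and the exact argument syntax accepted by
 * ParameterUtils.getParameters are assumptions here:
 *
 *   spark-submit --class io.github.seabow.DataX datax.jar config_path=/path/to/job-config
 */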
object DataX extends Logging {
  def main(args: Array[String]): Unit = {
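    // Baseline Spark configuration: Kryo serialization, the v2 file output committer,
    // dynamic partition overwrite and a longer broadcast timeout.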
    val conf: SparkConf = new SparkConf()
      .set("spark.port.maxRetries", "30")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
      .set("spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored", "true")
      .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
      .set("spark.sql.broadcastTimeout", "3000")
      .set("spark.cleaner.referenceTracking.cleanCheckpoints", "true")
    if (!conf.contains("spark.cleaner.periodicGC.interval")) {
      log.warn("spark.cleaner.periodicGC.interval is not set, using 3min as the default.")
      conf.set("spark.cleaner.periodicGC.interval", "3min")
    }
    // Parse command-line parameters.
    val params = ParameterUtils.getParameters(args)
    // The config file path must be supplied via parameters; there is no default.
    val configPath = params.get("config_path")
    if (configPath.isEmpty) {
      throw new IllegalArgumentException("No config_path specified")
    }
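    // Create the SparkSession with Hive support so the job can read and write Hive tables.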
    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .enableHiveSupport()
      .getOrCreate()
    org.apache.log4j.LogManager.getRootLogger.setLevel(Level.WARN)
    val appId = spark.sparkContext.applicationId
    val uiWebUrl = spark.sparkContext.uiWebUrl
    log.warn(s"application id : $appId")
    log.warn(s"app web url : $uiWebUrl")
    val configContent = spark.sparkContext.deployMode match {
      case "client" => Source.fromFile(configPath.get, "utf8").mkString
      case _ => SparkUtils.getFileContent(configPath.get)
    }
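    // Build the job from the config content and parameters; close() runs even if execute() throws.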
    val job = Job(configContent, params, spark)
    try {
      job.execute()
    } finally {
      job.close()
    }
  }
}