/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.streampark.common.util

import org.apache.streampark.common.conf.{CommonConfig, InternalConfigHolder}
import org.apache.streampark.common.conf.ConfigConst._

import org.apache.commons.collections.CollectionUtils
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hadoop.service.Service.STATE
import org.apache.hadoop.yarn.api.records.ApplicationId
import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.conf.YarnConfiguration

import javax.security.auth.kerberos.KerberosTicket

import java.io.{File, IOException}
import java.security.PrivilegedAction
import java.util
import java.util.{Timer, TimerTask}
import java.util.concurrent._

import scala.collection.JavaConversions._
import scala.util.{Failure, Success, Try}

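/**
 * Hadoop client utilities: reusable, kerberos-aware HDFS [[FileSystem]] and [[YarnClient]]
 * handles plus a few small helpers.
 *
 * Illustrative usage sketch (assumes a reachable HDFS/YARN cluster and a valid
 * HADOOP_CONF_DIR; the application id below is only a placeholder):
 * {{{
 *   val exists = HadoopUtils.hdfs.exists(new Path("/tmp"))
 *   val appId  = HadoopUtils.toApplicationId("application_1700000000000_0001")
 *   val report = HadoopUtils.yarnClient.getApplicationReport(appId)
 * }}}
 */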
object HadoopUtils extends Logger {

  private[this] lazy val HADOOP_HOME: String = "HADOOP_HOME"
  private[this] lazy val HADOOP_CONF_DIR: String = "HADOOP_CONF_DIR"
  private[this] lazy val CONF_SUFFIX: String = "/etc/hadoop"

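  // Lazily initialized, reusable clients; reset by closeHadoop() when kerberos credentials are refreshed.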
  private[this] var reusableYarnClient: YarnClient = _

  private[this] var ugi: UserGroupInformation = _

  private[this] var reusableConf: Configuration = _

  private[this] var reusableHdfs: FileSystem = _

  private[this] var tgt: KerberosTicket = _

  private[this] lazy val configurationCache: util.Map[String, Configuration] =
    new ConcurrentHashMap[String, Configuration]()

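  /**
   * Returns the cached [[UserGroupInformation]]: a kerberos login (from the configured principal
   * and keytab) when kerberos is enabled, otherwise a remote user created from the configured
   * hadoop user name.
   */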
  def getUgi(): UserGroupInformation = {
    if (ugi == null) {
      ugi = if (HadoopConfigUtils.kerberosEnable) {
        getKerberosUGI()
      } else {
        UserGroupInformation.createRemoteUser(HadoopConfigUtils.hadoopUserName)
      }
    }
    ugi
  }

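  /** Hadoop conf dir: $HADOOP_CONF_DIR if set, otherwise $HADOOP_HOME/etc/hadoop. */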
  private[this] lazy val hadoopConfDir: String = Try(
    FileUtils.getPathFromEnv(HADOOP_CONF_DIR)) match {
    case Failure(_) => FileUtils.resolvePath(FileUtils.getPathFromEnv(HADOOP_HOME), CONF_SUFFIX)
    case Success(value) => value
  }

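  /**
   * Interval (in milliseconds) at which the cached clients are refreshed: 90% of the TGT
   * lifetime when a TGT is available, otherwise the configured kerberos.ttl.
   */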
  private[this] lazy val tgtRefreshTime: Long = {
    val user = UserGroupInformation.getLoginUser
    val method = classOf[UserGroupInformation].getDeclaredMethod("getTGT")
    method.setAccessible(true)
    tgt = method.invoke(user).asInstanceOf[KerberosTicket]
    Option(tgt) match {
      case Some(value) =>
        val start = value.getStartTime.getTime
        val end = value.getEndTime.getTime
        ((end - start) * 0.90f).toLong
      case _ =>
        logWarn("get kerberos tgtRefreshTime failed, fall back to kerberos.ttl.")
        val timeUnit = DateUtils.getTimeUnit(InternalConfigHolder.get(CommonConfig.KERBEROS_TTL))
        timeUnit._2 match {
          case TimeUnit.SECONDS => timeUnit._1 * 1000
          case TimeUnit.MINUTES => timeUnit._1 * 60 * 1000
          case TimeUnit.HOURS => timeUnit._1 * 60 * 60 * 1000
          case TimeUnit.DAYS => timeUnit._1 * 60 * 60 * 24 * 1000
          case _ =>
            throw new IllegalArgumentException(
              s"[StreamPark] parameter: ${CommonConfig.KERBEROS_TTL.key} is invalid, unit options are [s|m|h|d]")
        }
    }
  }

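  /**
   * Build (and cache, keyed by directory) a [[Configuration]] from the core-site.xml,
   * hdfs-site.xml, yarn-site.xml and hdfs-default.xml files found under the given conf dir.
   */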
  def getConfigurationFromHadoopConfDir(confDir: String = hadoopConfDir): Configuration = {
    if (!configurationCache.containsKey(confDir)) {
      FileUtils.exists(confDir)
      val hadoopConfDir = new File(confDir)
      val confName = List("hdfs-default.xml", "core-site.xml", "hdfs-site.xml", "yarn-site.xml")
      val files =
        hadoopConfDir.listFiles().filter(x => x.isFile && confName.contains(x.getName)).toList
      val conf = new HadoopConfiguration()
      if (CollectionUtils.isNotEmpty(files)) {
        files.foreach(x => conf.addResource(new Path(x.getAbsolutePath)))
      }
      configurationCache.put(confDir, conf)
    }
    configurationCache(confDir)
  }

  /**
   * Note: There are two ways to load a hadoop configuration file:
   *   1) Copy core-site.xml, hdfs-site.xml and yarn-site.xml of the hadoop conf to the resource dir.
   *   2) Automatically go to $HADOOP_HOME/etc/hadoop to load the configuration.
   * We recommend the second method, which does not require copying the configuration files.
   */
  def hadoopConf: Configuration = Option(reusableConf).getOrElse {
    reusableConf = getConfigurationFromHadoopConfDir(hadoopConfDir)
    // add hadoopConfDir to the classpath so that the configuration resources under it
    // can also be located via the classloader
    Try(ClassLoaderUtils.loadResource(hadoopConfDir)) match {
      case Failure(e) => logWarn(s"Load hadoop resource to classpath failed. $e")
      case _ =>
    }
    if (StringUtils.isBlank(reusableConf.get("hadoop.tmp.dir"))) {
      reusableConf.set("hadoop.tmp.dir", "/tmp")
    }
    if (StringUtils.isBlank(reusableConf.get("hbase.fs.tmp.dir"))) {
      reusableConf.set("hbase.fs.tmp.dir", "/tmp")
    }
    // disable timeline service as we only query yarn app here.
    // Otherwise we may hit this kind of ERROR:
    // java.lang.ClassNotFoundException: com.sun.jersey.api.client.config.ClientConfig
    reusableConf.set("yarn.timeline-service.enabled", "false")
    reusableConf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
    reusableConf.set("fs.file.impl", classOf[LocalFileSystem].getName)
    reusableConf.set("fs.hdfs.impl.disable.cache", "true")
    reusableConf
  }

  /** Close and reset all cached hadoop clients so that the next access re-creates them. */
  private[this] def closeHadoop(): Unit = {
    if (reusableHdfs != null) {
      reusableHdfs.close()
      reusableHdfs = null
    }
    if (reusableYarnClient != null) {
      reusableYarnClient.close()
      reusableYarnClient = null
    }
    if (tgt != null && !tgt.isDestroyed) {
      tgt.destroy()
      tgt = null
    }
    reusableConf = null
    ugi = null
  }

  /** Log in from the configured kerberos principal and keytab and return the resulting UGI. */
  private[this] def getKerberosUGI(): UserGroupInformation = {
    logInfo("kerberos login starting....")
    require(
      HadoopConfigUtils.kerberosPrincipal.nonEmpty && HadoopConfigUtils.kerberosKeytab.nonEmpty,
      s"$KEY_SECURITY_KERBEROS_PRINCIPAL and $KEY_SECURITY_KERBEROS_KEYTAB must not be empty"
    )
    System.setProperty("javax.security.auth.useSubjectCredsOnly", "false")
    if (HadoopConfigUtils.kerberosKrb5.nonEmpty) {
      System.setProperty("java.security.krb5.conf", HadoopConfigUtils.kerberosKrb5)
      System.setProperty("java.security.krb5.conf.path", HadoopConfigUtils.kerberosKrb5)
    }
    System.setProperty("sun.security.spnego.debug", HadoopConfigUtils.kerberosDebug)
    System.setProperty("sun.security.krb5.debug", HadoopConfigUtils.kerberosDebug)
    hadoopConf.set(KEY_HADOOP_SECURITY_AUTHENTICATION, KEY_KERBEROS)
    Try {
      UserGroupInformation.setConfiguration(hadoopConf)
      val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(
        HadoopConfigUtils.kerberosPrincipal,
        HadoopConfigUtils.kerberosKeytab)
      UserGroupInformation.setLoginUser(ugi)
      logInfo("kerberos authentication successful")
      ugi
    } match {
      case Success(ugi) => ugi
      case Failure(e) => throw e
    }
  }

  /**
   * Reusable [[FileSystem]] handle. When kerberos is enabled, a timer periodically closes the
   * cached clients (see [[closeHadoop]]) so that they are re-created with fresh credentials.
   */
  def hdfs: FileSystem = {
    Option(reusableHdfs).getOrElse {
      reusableHdfs = Try {
        getUgi().doAs[FileSystem](new PrivilegedAction[FileSystem]() {
          // scalastyle:off FileSystemGet
          override def run(): FileSystem = FileSystem.get(hadoopConf)
          // scalastyle:on FileSystemGet
        })
      } match {
        case Success(fs) =>
          if (HadoopConfigUtils.kerberosEnable) {
            // reLogin...
            val timer = new Timer()
            timer.schedule(
              new TimerTask {
                override def run(): Unit = {
                  closeHadoop()
                  logInfo(
                    s"Check Kerberos Tgt And reLogin From Keytab Finish:refresh time: ${DateUtils.format()}")
                }
              },
              tgtRefreshTime,
              tgtRefreshTime
            )
          }
          fs
        case Failure(e) =>
          throw new IllegalArgumentException(s"[StreamPark] access hdfs error: $e")
      }
      reusableHdfs
    }
  }

  /** Reusable [[YarnClient]], re-created if the previous one is no longer in the STARTED state. */
  def yarnClient: YarnClient = {
    if (reusableYarnClient == null || !reusableYarnClient.isInState(STATE.STARTED)) {
      reusableYarnClient = Try {
        getUgi().doAs(new PrivilegedAction[YarnClient]() {
          override def run(): YarnClient = {
            val yarnConf = new YarnConfiguration(hadoopConf)
            val client = YarnClient.createYarnClient
            client.init(yarnConf)
            client.start()
            client
          }
        })
      } match {
        case Success(client) => client
        case Failure(e) =>
          throw new IllegalArgumentException(s"[StreamPark] access yarnClient error: $e")
      }
    }
    reusableYarnClient
  }

  /** Parse a yarn application id string of the form "application_<timestamp>_<id>". */
  def toApplicationId(appId: String): ApplicationId = {
    require(
      appId != null,
      "[StreamPark] HadoopUtils.toApplicationId: applicationId must not be null")
    val timestampAndId = appId.split("_")
    ApplicationId.newInstance(timestampAndId(1).toLong, timestampAndId.last.toInt)
  }

  /** Copy a jar from HDFS to a local temporary directory and return its local absolute path. */
  @throws[IOException]
  def downloadJar(jarOnHdfs: String): String = {
    val tmpDir = FileUtils.createTempDir()
    val fs = FileSystem.get(new Configuration)
    val sourcePath = fs.makeQualified(new Path(jarOnHdfs))
    if (!fs.exists(sourcePath)) throw new IOException(s"jar file: $jarOnHdfs doesn't exist.")
    val destPath = new Path(tmpDir.getAbsolutePath + "/" + sourcePath.getName)
    fs.copyToLocalFile(sourcePath, destPath)
    new File(destPath.toString).getAbsolutePath
  }

  /**
   * Configuration that strips the time-unit suffix introduced by Hadoop 3.x from a fixed set of
   * duration keys so that `getLong` on those keys keeps working (see HDFS-12920 below).
   */
  private class HadoopConfiguration extends Configuration {

    private lazy val rewriteNames = List(
      "dfs.blockreport.initialDelay",
      "dfs.datanode.directoryscan.interval",
      "dfs.heartbeat.interval",
      "dfs.namenode.decommission.interval",
      "dfs.namenode.replication.interval",
      "dfs.namenode.checkpoint.period",
      "dfs.namenode.checkpoint.check.period",
      "dfs.client.datanode-restart.timeout",
      "dfs.ha.log-roll.period",
      "dfs.ha.tail-edits.period",
      "dfs.datanode.bp-ready.timeout"
    )

    private def getHexDigits(value: String): String = {
      var negative = false
      var str = value
      var hexString: String = null
      if (value.startsWith("-")) {
        negative = true
        str = value.substring(1)
      }
      if (str.startsWith("0x") || str.startsWith("0X")) {
        hexString = str.substring(2)
        if (negative) hexString = "-" + hexString
        return hexString
      }
      null
    }

    // HDFS default value change (with adding time unit) breaks old version MR tarball work with Hadoop 3.x
    // detail: https://issues.apache.org/jira/browse/HDFS-12920
    private def getSafeValue(name: String): String = {
      val value = getTrimmed(name)
      if (rewriteNames.contains(name)) {
        return value.replaceFirst("s$", "")
      }
      value
    }

    override def getLong(name: String, defaultValue: Long): Long = {
      val valueString = getSafeValue(name)
      valueString match {
        case null => defaultValue
        case v =>
          val hexString = getHexDigits(v)
          if (hexString != null) {
            return java.lang.Long.parseLong(hexString, 16)
          }
          java.lang.Long.parseLong(valueString)
      }
    }
  }
}