All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ru.shubert.yt.YouTubeQuery.scala Maven / Gradle / Ivy

The newest version!
package ru.shubert.yt

import _root_.java.io.{BufferedReader, InputStreamReader}
import _root_.java.net.URLDecoder
import _root_.java.nio.charset.StandardCharsets
import java.security.MessageDigest

import scala.collection.JavaConverters._
import cats.MonadError
import cats.implicits._
import com.typesafe.scalalogging.Logger
import _root_.org.apache.http.NameValuePair
import _root_.org.apache.http.message.BasicNameValuePair
import io.circe.HCursor

import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{CloseableHttpResponse, HttpGet}
import org.apache.http.client.utils.{HttpClientUtils, URLEncodedUtils}
import org.apache.http.impl.client.HttpClients

import scala.collection.mutable
import scala.util.matching.UnanchoredRegex
import scala.util.{Failure, Success, Try}
import scala.language.higherKinds

import io.circe._, io.circe.parser._

/**
  * YouTube obscures download links, requiring urls with special signature in it.
  *
  * just 4k video  http://www.youtube.com/watch?v=Cx6eaVeYXOs
  *
  * prefix `s=`  https://www.youtube.com/watch?v=UxxajLWwzqY | url_encoded_fmt_stream_map
  * normal `s=`  https://www.youtube.com/watch?v=UxxajLWwzqY | adaptive_fmts
  * normal `s=`  https://www.youtube.com/watch?v=8UVNT4wvIGY | url_encoded_fmt_stream_map
  */
class YouTubeQuery[F[_]](implicit M: MonadError[F, Throwable]) extends SignatureDecipher[F] {

  import YouTubeQuery._

  /**
    * Request configuration attached to every GET issued by `readStringFromUrl`:
    * connection-request timeout 5000, connect timeout 5000, redirects enabled.
    *
    * NOTE(review): no socket (read) timeout is configured here — confirm that relying on the
    * client default is intended.
    */
  protected[yt] val ReqConfig: RequestConfig =
    RequestConfig
      .custom()
      .setConnectionRequestTimeout(5000)
      .setConnectTimeout(5000)
      .setRedirectsEnabled(true)
      .build()

  // Orders pairs by parameter name only: two pairs sharing a name compare as equal, whatever their values.
  // Written as an explicit anonymous class rather than a SAM lambda
  // (presumably for older Scala versions without SAM conversion — confirm before simplifying).
  //noinspection ConvertExpressionToSAM
  protected implicit val ordering: Ordering[NameValuePair] = new Ordering[NameValuePair] {
    override def compare(left: NameValuePair, right: NameValuePair): Int =
      left.getName.compareTo(right.getName)
  }

  /**
    * One parsed stream descriptor.
    *
    * @param urlExploded value of the descriptor's `url` parameter split on `?`
    * @param params      the descriptor's other parameters followed by the parameters parsed from the url's query
    */
  case class SingleStream(
    urlExploded: Array[String],
    params: mutable.Buffer[NameValuePair]
  )

  /**
    * Accumulator filled while folding over the parameters of a single stream.
    *
    * @param itag      stream itag; `defaultITag` until one is seen
    * @param signature stream signature; `defaultSignature` until one is seen
    * @param params    remaining parameters, held in a TreeSet built on the implicit `NameValuePair` ordering
    */
  case class TagStream(
    itag: Try[String] = defaultITag,
    signature: Try[String] = defaultSignature,
    params: mutable.TreeSet[NameValuePair] = mutable.TreeSet[NameValuePair]()
  )

  /**
    * Downloads `url` with an HTTP GET and returns the response body as one string.
    *
    * The body is read line by line and the lines are appended without their terminators,
    * so the returned text contains no line breaks.
    *
    * @param url address to fetch
    * @return the body text when the status is 200; any other status is raised inside `F` as
    *         [[YGNetworkException]], and any other non-fatal exception thrown while downloading is
    *         raised inside `F` as well
    */
  protected[yt] def readStringFromUrl(url: String): F[String] = {
    M.catchNonFatal {
      val method = new HttpGet(url)
      method.addHeader("Accept-Charset", StandardCharsets.UTF_8.name())
      method.addHeader("User-Agent", ModernBrowser)
      method.setConfig(ReqConfig)

      val client = HttpClients.createDefault()
      try {
        val resp: CloseableHttpResponse = client.execute(method)
        try {
          val status = resp.getStatusLine.getStatusCode
          if (status == 200) {
            logger.debug("Successful download for url {}", url)
            // Decode explicitly as UTF-8 (the charset requested above) instead of the JVM default charset.
            val stream = new BufferedReader(new InputStreamReader(resp.getEntity.getContent, StandardCharsets.UTF_8))
            val buffer = new StringBuilder
            Iterator.continually(stream.readLine()).takeWhile(_ != null).foreach(line => buffer.append(line))
            buffer.toString()
          } else {
            val msg = s"Error code $status while accessing $url"
            logger.debug(msg)
            throw YGNetworkException(msg)
          }
        } finally {
          // Release the response (and its content stream) before the client that owns the connection.
          HttpClientUtils.closeQuietly(resp)
        }
      } finally {
        HttpClientUtils.closeQuietly(client)
      }
    }
  }

  /**
    * Short fingerprint of `value`: the first five bytes of the MD5 digest of its UTF-8 bytes,
    * rendered as ten lowercase hex characters.
    */
  protected def MD5(value: String): String = {
    val hasher = MessageDigest.getInstance("MD5")
    val digest = hasher.digest(value.getBytes(StandardCharsets.UTF_8))
    digest.iterator.take(5).map(byte => "%02x".format(byte)).mkString("")
  }

  /**
    * Extracts the player config from a quite long javascript string.
    *
    * @param page where the player config should be found
    * @return cursor over the parsed config; raised inside `F` as [[YGParseException]] when `page` does not
    *         match `PlayerConfigRegex`, or as the parser's failure when the captured text is not valid JSON
    */
  protected def getPlayerConfig(page: String): F[HCursor] = page match {
    case PlayerConfigRegex(streams) =>
      parse(streams) match {
        case Right(json) =>
          M.pure(json.hcursor)
        case Left(failure) =>
          // Surface malformed JSON right here instead of continuing with a Null document
          // and failing later with an unrelated decoding error.
          logger.error("Player config is not valid JSON", failure)
          M.raiseError(failure)
      }
    case _ =>
      logger.error("Unable to extract player config from (first 300) " + page.take(300))
      M.raiseError(YGParseException("Player script was changed: " + page))
  }

  /**
    * Reads the `url_encoded_fmt_stream_map` (video) and `adaptive_fmts` (adaptive) strings from the
    * `args` node of the player config and converts their escape sequences to plain characters.
    *
    * A field that cannot be read as a string is logged and becomes `None`; the decoding error itself
    * is not propagated (see the TODO below).
    *
    * @param cfg cursor over the player config
    * @return holder with both optional stream blocks
    */
  protected def extractStreamsUrl(cfg: HCursor): F[StreamsHolder] = {
    //TODO: lost exception
    def extract(name: String, doc: ACursor): Option[String] =
      doc.get[String](name) match {
        case Left(failure) =>
          logger.error(s"Failed to extract $name", failure)
          None
        case Right(escaped) =>
          Some(StringContext.treatEscapes(escaped))
      }

    val args = cfg.downField("args")
    // Both lookups are started before they are combined; an exception thrown while un-escaping ends up in F.
    val videoF = M.catchNonFatal(extract("url_encoded_fmt_stream_map", args))
    val adaptiveF = M.catchNonFatal(extract("adaptive_fmts", args))

    M.flatMap(videoF) { video =>
      M.map(adaptiveF) { adaptive =>
        logger.debug("Video streams?: {} \n adaptive? {}", video.isDefined, adaptive.isDefined)
        StreamsHolder(video, adaptive)
      }
    }
  }

  /**
    * Reads the player script location from `assets.js` of the player config and URL-decodes it as UTF-8.
    *
    * @param cfg cursor over the player config
    * @return decoded player url; a missing or non-string field, or a value that cannot be URL-decoded,
    *         is raised inside `F`
    */
  protected def getPlayerUrl(cfg: HCursor): F[String] = {
    val encoded: F[String] = M.fromEither(cfg.downField("assets").get[String]("js"))
    // URLDecoder.decode throws IllegalArgumentException on malformed escapes; keep that failure inside F.
    M.flatMap(encoded)(raw => M.catchNonFatal(URLDecoder.decode(raw, StandardCharsets.UTF_8.name())))
  }


  /**
    * Builds download links for every stream descriptor found in `urls`.
    *
    * Each stream is described by [header][\s][url with some params].
    * Signature might be put either in header or url and might be of 3 types:
    *
    *  - `signature` - this one is plain and needs no deciphering
    *  - `s` - ciphered
    *  - `sig` - ciphered
    *
    * The obscuring function is determined by the player provided and may change not only with time but even
    * between different videos, i.e. some videos are bound to a special player revision.
    *
    * Descriptors without a `url` parameter, without an itag, without a usable signature or with a
    * non-numeric itag are skipped.
    *
    * @param urls     block of comma separated stream descriptors
    * @param decipher decipher function applied to ciphered signatures
    * @return pairs of itag (as Int) and download link
    */
  protected def buildDownloadLinks(urls: String, decipher: String => String): Seq[(Int, String)] = {
    // Short fingerprint of the whole block, used only to correlate the log lines below.
    val md5 = MD5(urls)

    // Splits one descriptor into the exploded url and all of its parameters; None when it carries no url.
    def getSingleStream(desc: String): Option[SingleStream] = {
      // Why so complicate? Youtube servers rejects requests with : 1.duplicate tags (!!!), 2.with + replaced with ' ', 3.on some urldecodings.
      // So here we doing our best not to interfere with params.
      val (urlPairs, headerPairs) =
        URLEncodedUtils.parse(desc, StandardCharsets.UTF_8).asScala.partition(_.getName == "url")

      urlPairs.headOption.flatMap(pair => Option(pair.getValue)).map { url =>
        val urlExploded = url.split("\\?")
        // A url without a query part simply contributes no extra parameters.
        val queryPairs: List[NameValuePair] =
          if (urlExploded.length > 1) {
            val decodedLine = URLDecoder.decode(urlExploded(1), StandardCharsets.UTF_8.name())
            URLEncodedUtils.parse(decodedLine, StandardCharsets.UTF_8).asScala.toList
          } else {
            Nil
          }
        SingleStream(urlExploded, headerPairs ++ queryPairs)
      }
    }

    // Turns one parsed descriptor into (itag, link); None when the itag or the signature is unavailable.
    def toLink(singleStream: SingleStream): Option[(Int, String)] = {
      val urlExploded: Array[String] = singleStream.urlExploded

      val taggedStream = singleStream.params.foldLeft(TagStream()) {
        case (acc, pair) =>
          pair.getName match {
            case "signature" => acc.copy(signature = Success(pair.getValue))
            // NOTE(review): the scaladoc lists `sig` as a ciphered key while the code matches `sign` —
            // confirm which key YouTube actually sends.
            case "s" | "sign" => acc.copy(signature = Try(decipher(pair.getValue)))
            case "itag" => acc.copy(itag = Success(pair.getValue))
            case _ =>
              // since 2016 youtube denies urls with empty params.
              if (pair.getValue != null && !pair.getValue.trim.isEmpty) {
                acc.params.add(pair)
              }
              acc
          }
      }

      logger.debug(s"For $md5 params are ${taggedStream.params}")

      (for {
        tag <- taggedStream.itag
        sig <- taggedStream.signature
      } yield {
        taggedStream.params.add(new BasicNameValuePair("sig", sig))
        taggedStream.params.add(new BasicNameValuePair("itag", tag))
        val link = urlExploded(0) + "?" +
          URLEncodedUtils.format(taggedStream.params.toList.asJava, StandardCharsets.UTF_8)
        tag.toInt -> link
      }).toOption
    }

    urls.split(",").flatMap { desc =>
      logger.debug(s"For $md5 parsed url $desc")
      val link: Option[(Int, String)] = getSingleStream(desc) match {
        case Some(singleStream) =>
          toLink(singleStream)
        case None =>
          // E.g. an empty descriptor block: skip it instead of failing the whole extraction.
          logger.debug(s"For $md5 skipping descriptor without url: $desc")
          None
      }
      link.toList
    }
  }

  /**
    * Parse streams from whole page represented by string.
    *
    * @param page youtube video html page
    * @return map of itag to download link, covering both the video and the adaptive stream blocks;
    *         failures of any step are raised inside `F`
    */
  def getStreamsFromString(page: String): F[Map[Int, String]] = {
    for {
      cfg <- getPlayerConfig(page)
      streamsF = extractStreamsUrl(cfg)
      playerUrlF = getPlayerUrl(cfg)
      streams <- streamsF
      playerUrl <- playerUrlF
      decipher <- registerPlayer(playerUrl, readStringFromUrl)
      videoF = M.catchNonFatal(streams.video.map(buildDownloadLinks(_, decipher)))
      adaptiveF = M.catchNonFatal(streams.adaptive.map(buildDownloadLinks(_, decipher)))
      video <- videoF
      adaptive <- adaptiveF
    } yield {
      // Option[Seq[(Int, String)]]
      (video ++ adaptive)
        .flatten
        .toMap
    }
  }

  /**
    * Scala oriented method that returns possible video streams.
    *
    * @param url video url of form `https://www.youtube.com/watch?v=ecekSCX3B4Q`
    * @return map of itag to download link, wrapped in `F`
    */
  def getStreams(url: String): F[Map[Int, String]] =
    readStringFromUrl(url).flatMap(getStreamsFromString)
}

object YouTubeQuery {
  private val logger = Logger(getClass.getName)

  // User-Agent header value sent with every request.
  lazy val ModernBrowser = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.50 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 Firefox/62.0"

  // Captures the JSON object assigned to `ytplayer.config`, up to the following `ytplayer.load`.
  val PlayerConfigRegex: UnanchoredRegex = """(?i)ytplayer\.config\s*=\s*(\{.*\});\s*ytplayer\.load""".r.unanchored

  // Initial values of a TagStream, replaced once the matching parameter is seen.
  lazy val defaultITag = Failure(YGParseException("itag not found"))
  // The message used to read "subscription not found", although this is the missing-signature marker.
  lazy val defaultSignature = Failure(YGParseException("signature not found"))

  // Accessing this member throws YGParseException: its initializer is a `throw`.
  lazy val unableToExtractJsException = throw YGParseException("Failed to extract js")
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy