
// ru.shubert.yt.YouTubeQuery.scala Maven / Gradle / Ivy
// The newest version!
package ru.shubert.yt
import _root_.java.io.{BufferedReader, InputStreamReader}
import _root_.java.net.URLDecoder
import _root_.java.nio.charset.StandardCharsets
import java.security.MessageDigest
import java.util.concurrent.{TimeUnit, Future ⇒ JFuture}
import java.util.{Map ⇒ JMap}
import scala.collection.JavaConverters._
import _root_.org.apache.http.NameValuePair
import _root_.org.apache.http.message.BasicNameValuePair
import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper}
import org.apache.commons.lang3.StringEscapeUtils
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{CloseableHttpResponse, HttpGet}
import org.apache.http.client.utils.{HttpClientUtils, URLEncodedUtils}
import org.apache.http.impl.client.HttpClients
import ru.shubert.yt.Decipher.DecipherFunction
import scala.collection.mutable
import scala.concurrent.duration.{Duration, FiniteDuration}
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.util.{Failure, Success, Try}
/**
  * YouTube obscures download links, requiring urls with a special signature in them.
  * This object downloads a video page, extracts the player configuration from it and
  * builds direct stream urls keyed by their `itag`.
  *
  * Sample videos used while developing:
  *
  * just 4k video http://www.youtube.com/watch?v=Cx6eaVeYXOs
  *
  * prefix `s=` https://www.youtube.com/watch?v=UxxajLWwzqY | url_encoded_fmt_stream_map
  * normal `s=` https://www.youtube.com/watch?v=UxxajLWwzqY | adaptive_fmts
  * normal `s=` https://www.youtube.com/watch?v=8UVNT4wvIGY | url_encoded_fmt_stream_map
  */
object YouTubeQuery extends Loggable {
  private val mapper = new ObjectMapper()
  private val ModernBrowser = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.0.0 Safari/537.11 Firefox/34.0"
  // Captures the JSON object assigned to `ytplayer.config`. `.` does not match line breaks,
  // which is why readStringFromUrl joins the page lines without separators.
  private val PlayerConfigRegex = """(?i)ytplayer\.config\s*=\s*(\{.*\});\s*ytplayer\.load""".r.unanchored

  /** Request settings shared by every HTTP call: 5 s connect / pool-lease timeouts, redirects followed. */
  protected[yt] val ReqConfig: RequestConfig = RequestConfig.custom()
    .setConnectionRequestTimeout(5000)
    .setConnectTimeout(5000)
    .setRedirectsEnabled(true)
    .build()

  // implicit val ordering: Ordering[NameValuePair] = (x: NameValuePair, y: NameValuePair) => x.getName.compare(y.getName)
  /** Orders parameters by name only, so a sorted set keeps at most one parameter per name. */
  implicit val ordering: Ordering[NameValuePair] = new Ordering[NameValuePair] {
    override def compare(x: NameValuePair, y: NameValuePair): Int = x.getName.compare(y.getName)
  }

  /**
    * One parsed stream description.
    *
    * @param urlExploded the `url` parameter value split on `?` (index 0 is the base url)
    * @param params      every other parameter of the description plus those from the url's query string
    */
  case class SingleStream(urlExploded: Array[String], params: mutable.Buffer[NameValuePair])

  /**
    * Accumulator used while folding over the parameters of a single stream.
    *
    * @param itag         value of the `itag` parameter, or a Failure while it has not been seen
    * @param subscription the (possibly deciphered) signature, or a Failure while it has not been seen
    * @param params       remaining non-empty parameters, unique by name (see [[ordering]])
    */
  case class TagStream(itag: Try[String] = Failure(new YGParseException("itag not found")),
                       subscription: Try[String] = Failure(new YGParseException("subscription not found")),
                       params: mutable.TreeSet[NameValuePair] = mutable.TreeSet[NameValuePair]())

  /**
    * Downloads `url` with a browser-like User-Agent and returns the body as a single string.
    * Line terminators are dropped: the lines are concatenated without a separator.
    *
    * @param url address to fetch
    * @return future with the body; fails with YGNetworkException when the status code is not 200
    */
  protected[yt] def readStringFromUrl(url: String): Future[String] = Future {
    // The HTTP round trip blocks the calling thread; tell the execution context about it.
    scala.concurrent.blocking {
      val method = new HttpGet(url)
      method.addHeader("Accept-Charset", StandardCharsets.UTF_8.name())
      method.addHeader("User-Agent", ModernBrowser)
      method.setConfig(ReqConfig)
      val client = HttpClients.createDefault()
      var resp: CloseableHttpResponse = null
      try {
        resp = client.execute(method)
        val status = resp.getStatusLine.getStatusCode
        if (status == 200) {
          LOG.debug("Successful download for url {}", url)
          // Decode explicitly as UTF-8 (the charset requested above) instead of the platform default.
          val stream = new BufferedReader(new InputStreamReader(resp.getEntity.getContent, StandardCharsets.UTF_8))
          val buffer = new StringBuilder
          Iterator.continually(stream.readLine()).takeWhile(_ != null).foreach(line => buffer.append(line))
          buffer.toString()
        } else {
          val msg = s"Error code $status while accessing $url"
          LOG.debug(msg)
          throw new YGNetworkException(msg)
        }
      } finally {
        // Release the response before the client that owns its connection.
        HttpClientUtils.closeQuietly(resp)
        HttpClientUtils.closeQuietly(client)
      }
    }
  }

  /** Short hex fingerprint (first 5 bytes of the MD5 digest) used to correlate log lines. */
  private def MD5(value: String) = {
    val md5 = MessageDigest.getInstance("MD5")
    md5.update(value.getBytes(StandardCharsets.UTF_8))
    md5.digest().take(5).map("%02x".format(_)).mkString
  }

  /**
    * Extracts player config from a quite long javascript string.
    *
    * @param page where player should be found
    * @return future with the parsed json tree; failed with YGParseException when the config
    *         assignment cannot be located, or with the parser's exception when it is not valid json
    */
  private def getPlayerConfig(page: String): Future[JsonNode] = page match {
    case PlayerConfigRegex(streams) => Future(mapper.readTree(streams))
    case _ =>
      LOG.debug("Unable to extract player config from (first 300) " + page.take(300))
      // NOTE(review): the message embeds the whole page, which can be very large - consider truncating.
      Future.failed(new YGParseException("Player script was changed: " + page))
  }

  /**
    * Extracts the video+audio stream blocks from `args` and converts them from escaped to plain text.
    *
    * @param cfg player config
    * @return holder with the optional `url_encoded_fmt_stream_map` and `adaptive_fmts` values
    */
  private def extractStreamsUrl(cfg: JsonNode): Future[StreamsHolder] = {
    val root = cfg.path("args")

    def extract(name: String, doc: JsonNode): Option[String] =
      Option(doc.path(name).asText(null)).map(StringEscapeUtils.unescapeJava)

    Future {
      val video = extract("url_encoded_fmt_stream_map", root)
      val adaptive = extract("adaptive_fmts", root)
      LOG.trace("Video streams?: {} \n adaptive? {}", video.isDefined, adaptive.isDefined)
      StreamsHolder(video, adaptive)
    }
  }

  /**
    * Reads the url-decoded player script location from `assets.js`.
    *
    * @param cfg player config
    * @return future with the player url; failed with YGParseException when the field is absent
    */
  private def getPlayerUrl(cfg: JsonNode): Future[String] = Future {
    Option(cfg.path("assets").path("js").asText(null))
      .map(URLDecoder.decode(_, StandardCharsets.UTF_8.name()))
      .getOrElse(throw new YGParseException("Player js url (assets.js) not found in player config"))
  }

  /**
    * Each stream is described by [header][\s][url with some params]
    * Signature might be put either in header or url and might be of 3 types :
    *
    * - `signature` - this one is plain and need no decipher
    * - `s` - ciphered
    * - `sign` - ciphered
    *
    * NOTE(review): this comment used to list `sig` while the code matches `sign`; the code is
    * left untouched - confirm which parameter name YouTube actually sends.
    *
    * Obscuring function is determined by player provided and may change not with time only but even with different videos.
    * I.e. some of videos are bound to special player revision.
    *
    * A descriptor that cannot be parsed (no `url` parameter, url without a query string, ...)
    * yields a Failure element instead of aborting the whole batch.
    *
    * @param urls     comma separated block of stream descriptions
    * @param decipher decipher function; applied to ciphered signatures
    * @return one element per description: Success(itag -> download url) or the reason it failed
    */
  private def buildDownloadLinks(urls: String, decipher: DecipherFunction): Seq[Try[(Int, String)]] = {
    val md5 = MD5(urls)

    def getSingleStream(desc: String): SingleStream = {
      // Why so complicate? Youtube servers rejects requests with : 1.duplicate tags (!!!), 2.with + replaced with ' ', 3.on some urldecodings.
      // So here we doing our best not to interfere with params.
      val parsed = URLEncodedUtils.parse(desc, StandardCharsets.UTF_8).asScala
      val (urlParams, otherParams) = parsed.partition(_.getName == "url")
      val urlValue = urlParams.headOption
        .flatMap(p => Option(p.getValue))
        .getOrElse(throw new YGParseException(s"No url parameter in stream description: $desc"))
      val urlExploded = urlValue.split("\\?")
      if (urlExploded.length < 2) {
        throw new YGParseException(s"Stream url has no query string: $urlValue")
      }
      val decodedLine = URLDecoder.decode(urlExploded(1), StandardCharsets.UTF_8.name())
      val params = otherParams ++ URLEncodedUtils.parse(decodedLine, StandardCharsets.UTF_8).asScala
      SingleStream(urlExploded, params)
    }

    def toLink(desc: String): Try[(Int, String)] = {
      LOG.debug(s"For $md5 parsed url $desc")
      // Try(...).flatMap also captures non-fatal exceptions thrown while folding/formatting.
      Try(getSingleStream(desc)).flatMap { singleStream =>
        val urlExploded: Array[String] = singleStream.urlExploded
        val taggedStream = singleStream.params.foldLeft(TagStream()) { case (acc, pair) =>
          pair.getName match {
            case "signature" => acc.copy(subscription = Success(pair.getValue))
            case "s" | "sign" => acc.copy(subscription = decipher.flatMap(f => f(pair.getValue)))
            case "itag" => acc.copy(itag = Success(pair.getValue))
            case _ => // since 2016 youtube denies urls with empty params.
              if (pair.getValue != null && !pair.getValue.trim.isEmpty) {
                acc.params.add(pair)
              }
              acc
          }
        }
        LOG.debug(s"For $md5 params are ${taggedStream.params}")
        for {
          tag <- taggedStream.itag
          sig <- taggedStream.subscription
        } yield {
          taggedStream.params.add(new BasicNameValuePair("signature", sig))
          taggedStream.params.add(new BasicNameValuePair("itag", tag))
          val link = urlExploded(0) + "?" + URLEncodedUtils.format(taggedStream.params.toList.asJava, StandardCharsets.UTF_8)
          tag.toInt -> link
        }
      }
    }

    urls.split(",").toSeq.map(toLink)
  }

  /**
    * Parse streams from whole page represented by string.
    *
    * Streams that cannot be resolved (missing itag/signature, malformed description, decipher
    * failure) are logged at debug level and left out of the result.
    *
    * @param page youtube video html page
    * @return future with a map of itag to download url
    */
  def getStreamsFromString(page: String): Future[Map[Int, String]] = {
    getPlayerConfig(page).flatMap { cfg =>
      // Both futures are created before the for-comprehension so they run concurrently.
      val streamsF = extractStreamsUrl(cfg)
      val playerUrlF = getPlayerUrl(cfg)
      for {
        streams <- streamsF
        playerUrl <- playerUrlF
        decipher: DecipherFunction = Decipher.registerPlayer(playerUrl, readStringFromUrl)
        videoF = Future(streams.video.map(buildDownloadLinks(_, decipher)))
        adaptiveF = Future(streams.adaptive.map(buildDownloadLinks(_, decipher)))
        video <- videoF
        adaptive <- adaptiveF
      } yield {
        // Option[Seq[Try[(Int, String)]
        val resolved = (video ++ adaptive).flatten
        resolved.foreach {
          case Failure(e) => LOG.debug(s"Skipping stream that could not be resolved: ${e.getMessage}")
          case _ =>
        }
        // On duplicate itags the later entry (adaptive) wins, as with a plain toMap.
        resolved.collect { case Success(entry) => entry }.toMap
      }
    }
  }

  /**
    * Scala oriented method that returns possible video streams.
    *
    * @param url video url of form `https://www.youtube.com/watch?v=ecekSCX3B4Q`
    * @return future with a map of itag to video url
    */
  def getStreams(url: String): Future[Map[Int, String]] = readStringFromUrl(url).flatMap(getStreamsFromString)

  /**
    * Same as getStreams, but exposed as a `java.util.concurrent.Future` of a java map.
    *
    * The returned future cannot be cancelled. NOTE(review): `get` rethrows the underlying
    * failure as-is (it is not wrapped in `ExecutionException`); kept for compatibility.
    *
    * @param url video url from youtube
    * @return java future with a map of itag to url
    */
  def getJavaStreams(url: String): JFuture[JMap[Int, String]] = {
    class MyFuture(future: Future[Map[Int, String]]) extends JFuture[JMap[Int, String]] {
      // Cancellation is not supported: report that the task could not be cancelled
      // rather than throwing a fatal NotImplementedError at the caller.
      override def cancel(mayInterruptIfRunning: Boolean): Boolean = false

      override def isCancelled: Boolean = false

      override def isDone: Boolean = future.isCompleted

      override def get(): JMap[Int, String] = Await.result(future, Duration.Inf).asJava

      override def get(timeout: Long, unit: TimeUnit): JMap[Int, String] =
        Await.result(future, FiniteDuration(timeout, unit)).asJava
    }

    new MyFuture(getStreams(url))
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy