io.gatling.http.fetch.HtmlParser.scala Maven / Gradle / Ivy
The newest version!
/*
* Copyright 2011-2024 GatlingCorp (https://gatling.io)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.gatling.http.fetch
import java.util.Locale
import scala.collection.mutable
import scala.util.control.NonFatal
import io.gatling.commons.util.Throwables._
import io.gatling.core.check.css.Lagarto
import io.gatling.http.client.uri.Uri
import io.gatling.http.util.HttpHelper
import com.typesafe.scalalogging.StrictLogging
import jodd.lagarto.{ EmptyTagVisitor, Tag, TagType }
import jodd.util.CharSequenceUtil
/**
 * A URL found while parsing a document, before it has been resolved against the document's root URI.
 */
private[fetch] sealed abstract class RawResource {

  /** The URL text as it was extracted from the document. */
  def rawUrl: String

  /**
   * Resolves [[rawUrl]] against `rootURI` by delegating to `HttpHelper.resolveFromUriSilently`.
   *
   * @param rootURI the URI that relative URLs are resolved against
   * @return the resolved URI, or `None` when the helper yields nothing
   */
  def uri(rootURI: Uri): Option[Uri] =
    HttpHelper.resolveFromUriSilently(rootURI, rawUrl)

  /**
   * Turns this raw URL into a fetchable resource.
   *
   * @param rootURI the URI that relative URLs are resolved against
   * @return the resource built from the resolved URI, if any
   */
  def toEmbeddedResource(rootURI: Uri): Option[ConcurrentResource]
}
/** A raw URL whose resolved form is wrapped in a `CssResource`. */
private[fetch] final case class CssRawResource(rawUrl: String) extends RawResource {

  /** Resolves the raw URL against `rootURI` and, when that succeeds, wraps it in a `CssResource`. */
  def toEmbeddedResource(rootURI: Uri): Option[ConcurrentResource] =
    for (resolved <- uri(rootURI)) yield CssResource(resolved)
}
/** A raw URL whose resolved form is wrapped in a `BasicResource`. */
private[fetch] final case class RegularRawResource(rawUrl: String) extends RawResource {

  /** Resolves the raw URL against `rootURI` and, when that succeeds, wraps it in a `BasicResource`. */
  def toEmbeddedResource(rootURI: Uri): Option[ConcurrentResource] =
    for (resolved <- uri(rootURI)) yield BasicResource(resolved)
}
/**
 * Outcome of parsing one HTML document.
 *
 * @param rawResources the raw URLs collected from the document, in the order they were encountered
 * @param base the `href` of the last `base` tag seen that carried one, if any
 */
private[fetch] final case class HtmlResources(
    rawResources: Seq[RawResource],
    base: Option[String]
)
/** Tag names, attribute names and `rel` values used by the `HtmlParser` class, plus its crash-logging helper. */
private[gatling] object HtmlParser extends StrictLogging {

  // Tag names the parser looks for.
  private val AppletTagName = "applet"
  private val BaseTagName = "base"
  private val BgsoundTagName = "bgsound"
  private val BodyTagName = "body"
  private val EmbedTagName = "embed"
  private val ImgTagName = "img"
  private val InputTagName = "input"
  private val LinkTagName = "link"
  private val ObjectTagName = "object"
  private val StyleTagName = "style"

  // Attribute names that may carry a resource URL (or qualify one).
  private val ArchiveAttribute = "archive"
  private val BackgroundAttribute = "background"
  private val CodeAttribute = "code"
  private val CodeBaseAttribute = "codebase"
  private val DataAttribute = "data"
  private val HrefAttribute = "href"
  private val RelAttribute = "rel"
  private val SrcAttribute = "src"
  // The inline style attribute shares its name with the style tag.
  private val StyleAttribute = StyleTagName

  // Lower-cased `rel` values of `link` tags that designate a resource to fetch.
  private val StylesheetRelValue = "stylesheet"
  private val PrefetchRelValue = "prefetch"
  private val IconRelValue = "icon"
  private val ShortcutIconRelValue = "shortcut icon"

  /**
   * Reports a parser failure.
   *
   * When debug logging is enabled the whole document is dumped alongside the exception;
   * otherwise only a short error mentioning the root message is emitted.
   *
   * @param htmlContent the document that was being parsed
   * @param e the exception raised by the parser
   */
  def logException(htmlContent: Array[Char], e: Throwable): Unit = {
    val debugEnabled = logger.underlying.isDebugEnabled
    if (!debugEnabled) {
      logger.error(
        s"HTML parser crashed: ${e.rootMessage}, there's a chance your page wasn't proper HTML, enable debug on 'io.gatling.http.fetch' logger to get the HTML content",
        e
      )
    } else {
      // The continuation lines of this literal must stay at column 0: they are part of the logged text.
      logger.debug(
        s"""HTML parser crashed, there's a chance your page wasn't proper HTML:
>>>>>>>>>>>>>>>>>>>>>>>
${new String(htmlContent)}
<<<<<<<<<<<<<<<<<<<<<<<""",
        e
      )
    }
  }
}
/**
 * Collects the URLs of the resources embedded in an HTML document (scripts, stylesheets, images,
 * applets, objects, inline style images, ...) and resolves them against the document URI.
 */
class HtmlParser extends StrictLogging {
  import HtmlParser._

  /**
   * True while the visitor is between an opening and a closing `style` tag.
   * Kept as a public member for backward compatibility; it is reset at the start of every parse.
   */
  var inStyle = false

  /**
   * Walks the document with a Lagarto visitor and gathers every raw resource URL plus the
   * `base` href, if any. Parser failures are logged and whatever was collected so far is returned.
   */
  private def parseHtml(htmlContent: Array[Char]): HtmlResources = {
    // A previous parse that aborted inside a <style> element must not leak its state into this one.
    inStyle = false

    var base: Option[String] = None
    val rawResources = mutable.ArrayBuffer.empty[RawResource]

    val visitor: EmptyTagVisitor = new EmptyTagVisitor {

      /** Registers the value of `attributeName`, when the tag has it, as a resource built by `factory`. */
      def addResource(tag: Tag, attributeName: String, factory: String => RawResource): Unit =
        Option(tag.getAttributeValue(attributeName)).foreach { url =>
          rawResources += factory(url.toString)
        }

      override def script(tag: Tag, body: CharSequence): Unit =
        addResource(tag, SrcAttribute, RegularRawResource)

      override def text(text: CharSequence): Unit =
        if (inStyle)
          rawResources ++= CssParser.extractStyleImportsUrls(text).map(CssRawResource)

      override def tag(tag: Tag): Unit = {
        def codeBase(): Option[CharSequence] = Option(tag.getAttributeValue(CodeBaseAttribute))

        /**
         * Prefixes `url` with `codeBase`, inserting a '/' separator when the code base does not end with one.
         * URLs starting with "http" are returned untouched, and so is any URL when the code base is empty.
         */
        def prependCodeBase(codeBase: CharSequence, url: String): String =
          if (url.startsWith("http") || codeBase.length() == 0) {
            url
          } else if (codeBase.charAt(codeBase.length() - 1) != '/') {
            // Look at the LAST character; charAt(length) is always out of bounds.
            s"$codeBase/$url"
          } else {
            s"$codeBase$url"
          }

        def processTag(): Unit =
          tag.getType match {
            case TagType.START | TagType.SELF_CLOSING =>
              if (tag.isRawTag && tag.nameEquals(StyleTagName)) {
                inStyle = true
              } else if (tag.nameEquals(BaseTagName)) {
                base = Option(tag.getAttributeValue(HrefAttribute)).map(_.toString)
              } else if (tag.nameEquals(LinkTagName)) {
                Option(tag.getAttributeValue(RelAttribute)).map(_.toString.toLowerCase(Locale.ROOT)) match {
                  case Some(StylesheetRelValue) =>
                    addResource(tag, HrefAttribute, CssRawResource)
                  // The href may be missing: guard against null instead of dereferencing it.
                  case Some(PrefetchRelValue) if Option(tag.getAttributeValue(HrefAttribute)).exists(_.toString.contains(".css")) =>
                    addResource(tag, HrefAttribute, CssRawResource)
                  case Some(IconRelValue) | Some(ShortcutIconRelValue) | Some(PrefetchRelValue) =>
                    addResource(tag, HrefAttribute, RegularRawResource)
                  case _ =>
                }
              } else if (
                tag.nameEquals(ImgTagName) ||
                tag.nameEquals(BgsoundTagName) ||
                tag.nameEquals(EmbedTagName) ||
                tag.nameEquals(InputTagName)
              ) {
                addResource(tag, SrcAttribute, RegularRawResource)
              } else if (tag.nameEquals(BodyTagName)) {
                addResource(tag, BackgroundAttribute, RegularRawResource)
              } else if (tag.nameEquals(AppletTagName)) {
                // The comma-separated archive list wins; code is only the fallback and is
                // read lazily so an applet without a code attribute no longer crashes the parse.
                val appletResources: Seq[String] =
                  Option(tag.getAttributeValue(ArchiveAttribute))
                    .map(_.toString.split(",").view.map(_.trim).to(Seq))
                    .orElse(Option(tag.getAttributeValue(CodeAttribute)).map(_.toString :: Nil))
                    .getOrElse(Nil)
                val appletResourcesUrls = codeBase() match {
                  case Some(cb) => appletResources.map(prependCodeBase(cb, _))
                  case _        => appletResources
                }
                rawResources ++= appletResourcesUrls.map(RegularRawResource)
              } else if (tag.nameEquals(ObjectTagName)) {
                Option(tag.getAttributeValue(DataAttribute)).foreach { data =>
                  val objectResourceUrl = codeBase() match {
                    case Some(cb) => prependCodeBase(cb, data.toString)
                    case _        => data.toString
                  }
                  rawResources += RegularRawResource(objectResourceUrl)
                }
              } else {
                // Any other tag: look for image URLs inside its inline style attribute.
                Option(tag.getAttributeValue(StyleAttribute)).foreach { style =>
                  val styleUrls = CssParser.extractInlineStyleImageUrls(style).map(RegularRawResource)
                  rawResources ++= styleUrls
                }
              }

            case TagType.END =>
              if (inStyle && tag.nameEquals(StyleTagName))
                inStyle = false

            case _ =>
          }

        processTag()
      }
    }

    // Best effort: a crash is logged and the resources gathered before it are still returned.
    try {
      Lagarto.newLagartoParser(htmlContent).parse(visitor)
    } catch { case NonFatal(e) => logException(htmlContent, e) }

    HtmlResources(rawResources.toSeq, base)
  }

  /**
   * Parses `htmlContent` and returns the distinct embedded resources it references.
   *
   * Relative URLs are resolved against the document's `base` href when one is present (itself
   * resolved against `documentURI`), otherwise against `documentURI`. Empty URLs, fragment-only
   * URLs (starting with '#') and `data:` URLs are skipped, as are URLs that cannot be resolved.
   *
   * @param documentURI the URI the document was fetched from
   * @param htmlContent the document content
   * @return the resources to fetch, in document order, without duplicates
   */
  def getEmbeddedResources(documentURI: Uri, htmlContent: Array[Char]): List[ConcurrentResource] = {
    val htmlResources = parseHtml(htmlContent)
    val rootURI = htmlResources.base.map(Uri.create(documentURI, _)).getOrElse(documentURI)

    htmlResources.rawResources.view.distinct
      .filterNot(res => res.rawUrl.isEmpty || res.rawUrl.charAt(0) == '#' || res.rawUrl.startsWith("data:"))
      .flatMap(_.toEmbeddedResource(rootURI).toList)
      .to(List)
  }
}