All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.gatling.http.fetch.HtmlParser.scala Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2011-2014 eBusiness Information, Groupe Excilys (www.ebusinessinformation.fr)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * 		http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.gatling.http.fetch

import java.net.{ URI, URISyntaxException }

import scala.collection.{ breakOut, mutable }

import com.typesafe.scalalogging.slf4j.StrictLogging

import io.gatling.http.util.HttpHelper
import jodd.lagarto.{ EmptyTagVisitor, LagartoParser, Tag }

/**
 * A resource reference as it was written in the parsed document, i.e. before it
 * has been resolved against a root URI.
 */
sealed abstract class RawResource {

  /** The URL text exactly as it was extracted from the document. */
  def rawUrl: String

  /**
   * Resolves [[rawUrl]] against `rootURI` by delegating to
   * `HttpHelper.resolveFromURISilently`.
   *
   * @param rootURI the URI relative references are resolved against
   * @return the result of the helper; presumably empty when resolution fails (confirm in HttpHelper)
   */
  def uri(rootURI: URI): Option[URI] = {
    HttpHelper.resolveFromURISilently(rootURI, rawUrl)
  }

  /**
   * Turns this raw reference into an [[EmbeddedResource]] resolved against `rootURI`.
   *
   * @param rootURI the URI relative references are resolved against
   * @return the embedded resource, if one could be built
   */
  def toEmbeddedResource(rootURI: URI): Option[EmbeddedResource]
}
/**
 * Raw reference that is converted into a `CssResource` once its URL is resolved.
 *
 * @param rawUrl the URL text as found in the document
 */
case class CssRawResource(rawUrl: String) extends RawResource {

  /** Resolves the raw URL against `rootURI` and wraps the outcome in a `CssResource`. */
  def toEmbeddedResource(rootURI: URI): Option[EmbeddedResource] =
    for (resolved <- uri(rootURI)) yield CssResource(resolved)
}
/**
 * Raw reference that is converted into a `RegularResource` once its URL is resolved.
 *
 * @param rawUrl the URL text as found in the document
 */
case class RegularRawResource(rawUrl: String) extends RawResource {

  /** Resolves the raw URL against `rootURI` and wraps the outcome in a `RegularResource`. */
  def toEmbeddedResource(rootURI: URI): Option[EmbeddedResource] = {
    val resolved = uri(rootURI)
    resolved.map(target => RegularResource(target))
  }
}

/**
 * Scans an HTML document with Lagarto and collects the resources it embeds
 * (scripts, stylesheets, images, applets, objects, inline style URLs, ...).
 */
object HtmlParser extends StrictLogging {

  /**
   * Parses `htmlContent` and returns the distinct embedded resources it references.
   *
   * Raw URLs are collected while visiting the tags, then resolved against the
   * `<base href>` of the document when one was successfully parsed, or against
   * `documentURI` otherwise. References that cannot be turned into an
   * [[EmbeddedResource]] are dropped.
   *
   * @param documentURI the URI the document was fetched from
   * @param htmlContent the HTML text to parse
   * @return the embedded resources, without duplicates, in order of first appearance
   */
  def getEmbeddedResources(documentURI: URI, htmlContent: Array[Char]): List[EmbeddedResource] = {

    val rawResources = mutable.ArrayBuffer.empty[RawResource]
    var baseURI: Option[URI] = None

    val lagartoParser = new LagartoParser(htmlContent)

    val visitor = new EmptyTagVisitor {

      // Records the resource referenced by `attributeName`, if the tag carries that attribute.
      def addResource(tag: Tag, attributeName: String, factory: String => RawResource): Unit =
        Option(tag.getAttributeValue(attributeName, false)).foreach { url =>
          rawResources += factory(url)
        }

      override def script(tag: Tag, body: CharSequence): Unit =
        addResource(tag, "src", RegularRawResource)

      override def style(tag: Tag, body: CharSequence): Unit =
        rawResources ++= CssParser.extractUrls(body, CssParser.styleImportsUrls).map(CssRawResource)

      override def tag(tag: Tag): Unit = {

        // The tag's codebase attribute, guaranteed to end with '/'.
        // A missing or empty attribute yields None so that nothing gets prepended.
        def suffixedCodeBase(): Option[String] =
          Option(tag.getAttributeValue("codebase", false))
            .filter(_.nonEmpty)
            .map(cb => if (cb.endsWith("/")) cb else cb + '/')

        // Prefixes `url` with `codeBase` unless it already is an absolute http(s) URL.
        def prependCodeBase(codeBase: String, url: String): String =
          if (url.startsWith("http"))
            url
          else
            codeBase + url

        tag.getName.toLowerCase match {
          case "base" =>
            // A base tag without href, or with a malformed one, resets the base URI to None.
            baseURI = Option(tag.getAttributeValue("href", false)).flatMap { href =>
              try {
                Some(new URI(href))
              } catch {
                case e: URISyntaxException =>
                  logger.debug(s"Malformed baseHref $href")
                  None
              }
            }

          case "link" =>
            // A missing rel attribute (null) falls through to the default case.
            tag.getAttributeValue("rel", false) match {
              case "stylesheet" => addResource(tag, "href", CssRawResource)
              case "icon"       => addResource(tag, "href", RegularRawResource)
              case _            =>
            }

          case "bgsound" => addResource(tag, "src", RegularRawResource)
          case "img"     => addResource(tag, "src", RegularRawResource)
          case "embed"   => addResource(tag, "src", RegularRawResource)
          case "input"   => addResource(tag, "src", RegularRawResource) // only if type=image?
          case "body"    => addResource(tag, "background", RegularRawResource)

          case "applet" =>
            // The archive attribute, when present, takes precedence over code.
            val code = Option(tag.getAttributeValue("code", false))
            val archives = Option(tag.getAttributeValue("archive", false))
              .map(_.split(",").map(_.trim).filter(_.nonEmpty).toList)

            val appletResources = archives.getOrElse(code.toList)
            val appletResourcesUrls = suffixedCodeBase() match {
              case Some(cb) => appletResources.map(prependCodeBase(cb, _))
              case None     => appletResources
            }
            rawResources ++= appletResourcesUrls.map(RegularRawResource)

          case "object" =>
            // An object tag without a data attribute references nothing to fetch.
            Option(tag.getAttributeValue("data", false)).foreach { data =>
              val objectResourceUrl = suffixedCodeBase() match {
                case Some(cb) => prependCodeBase(cb, data)
                case None     => data
              }
              rawResources += RegularRawResource(objectResourceUrl)
            }

          case _ =>
            // Any other tag: look for image URLs inside an inline style attribute.
            Option(tag.getAttributeValue("style", false)).foreach { style =>
              val styleUrls = CssParser.extractUrls(style, CssParser.inlineStyleImageUrls).map(RegularRawResource)
              rawResources ++= styleUrls
            }
        }
      }
    }

    lagartoParser.parse(visitor)

    val rootURI = baseURI.getOrElse(documentURI)

    rawResources
      .distinct
      .flatMap(_.toEmbeddedResource(rootURI))
      .toList
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy