com.intenthq.gander.extractors.ContentExtractor.scala

package com.intenthq.gander.extractors

import java.net.URL
import java.text.Normalizer
import java.util.Date
import java.util.regex.Pattern

import com.intenthq.gander.Link
import com.intenthq.gander.text.{StopWords, WordStats}
import com.intenthq.gander.utils.JSoup._
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.convert.Wrappers.JListWrapper
import scala.collection.mutable
import scala.math._
import scala.util.Try
import org.joda.time.format.ISODateTimeFormat.dateTimeParser


object ContentExtractor {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  def extractTitle(doc: Document): String =
    // drop any Unicode replacement characters (U+FFFD) left behind by bad encodings
    byTag("title")(doc).headOption.map(_.text).getOrElse("").replace("\ufffd", "").trim

  def processTitle(rawTitle: String, canonical: Option[String]): String = {
    def normalize(str: String) =
      Normalizer.normalize(str, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "")

    canonical.flatMap(c => Try(new URL(c)).toOption).flatMap { url =>
      val names = url.getAuthority.split('.').init.filter(_.length > 2).filter(_ != "www")
      List(""" | """, " • ", " › ", " :: ", " » ", " - ", " : ", " — ", " · ").collectFirst {
        case separator if rawTitle.contains(separator) =>
          val parts = rawTitle.split(Pattern.quote(separator))
          val partsNot = parts.filterNot { part =>
            names.exists(name => normalize(part).toLowerCase.replace(" ", "").contains(name))
          }
          partsNot.mkString(separator).trim
      }
    }.getOrElse(rawTitle)
  }
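
  // Worked example for processTitle (title and URL made up): for
  //   processTitle("Gander 1.0 released | Intent HQ Blog", Some("http://blog.intenthq.com/post"))
  // the authority yields names Array("blog", "intenthq"); " | " is the first matching
  // separator, and "Intent HQ Blog" normalizes to "intenthqblog", which contains
  // "intenthq", so only "Gander 1.0 released" survives.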

  def extractLang(doc: Document): Option[String] =
    byTag("html")(doc).headOption.map(_.attr("lang")).filter(_.nonEmpty).orElse(
      metaContent("http-equiv=Content-Language")(doc).orElse(
        metaContent("property=og:locale")(doc)
      )
    )

  def extractDate(doc: Document): Option[DateTime] = {
    metaContent("property=article:published_time")(doc).orElse(
      metaContent("name=DCTERMS.created")(doc).orElse(
        select("time[class=dt-published published entry-date]")(doc).headOption.map(_.attr("datetime").trim).orElse(
          select("time[itemprop=datePublished]")(doc).headOption.map(_.attr("datetime").trim).orElse(
            metaContent("name=DisplayDate")(doc).orElse(
              metaContent("name=date")(doc)
            )
          )
        )
      )
    ).flatMap(x =>
      // replaceAll("/","-") is needed as ISODateTimeFormat will block on /
      // e.g. http://www.bbc.co.uk/sport/0/football/34203622
      Try(dateTimeParser.parseDateTime(x.replaceAll("/","-"))).toOption
    )
  }
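
  // A minimal usage sketch for extractDate (markup hypothetical): for a document whose
  // <head> contains <meta property="article:published_time" content="2015-09-11T09:30:00Z"/>
  // the first lookup wins and the ISO parser yields a defined DateTime for that instant
  // (rendered in the JVM default zone).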

  private def metaContent(metaName: String)(implicit doc: Document): Option[String] =
    select(s"meta[$metaName]").headOption.map(_.attr("content").trim)

  /**
  * if the article has a meta description set in the source, use that
  */
  def extractMetaDescription(implicit doc: Document): String =
    metaContent("name=description").orElse(
      metaContent("property=og:description").orElse(
        metaContent("name=twitter:description")
      )
    ).getOrElse("").trim

  /**
  * if the article has meta keywords set in the source, use that
  */
  def extractMetaKeywords(implicit doc: Document): String = metaContent("name=keywords").getOrElse("")

  /**
   * if the article has a canonical link set in the source, use that
   */
  def extractCanonicalLink(implicit doc: Document): Option[String] =
    select("link[rel=canonical]").headOption.map(_.attr("abs:href")).orElse(
      select("meta[property=og:url]").headOption.map(_.attr("abs:content"))
    ).orElse(
      select("meta[name=twitter:url]").headOption.map(_.attr("abs:content"))
    ).map(_.trim)
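
  // For example (URLs hypothetical): a page fetched from http://example.com/article?utm_source=x
  // that declares <link rel="canonical" href="/article"/> yields
  // Some("http://example.com/article"), provided the document was parsed with a base URI
  // so that abs:href can resolve; og:url and twitter:url are only consulted as fallbacks.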

  def extractDateFromURL(url: String): Option[Date] = {
    def findYearMonthAndDay(segments: Array[String]): (Option[Int], Option[Int], Option[Int]) = {
      def findMonthAndDay(segments: Array[String]): (Option[Int], Option[Int]) = {
        def findDay(segment: String): Option[Int] = Try(segment.toInt).filter(d => d >= 1 && d <= 31).toOption
        Try(segments.head.toInt).filter(m => m >= 1 && m <= 12).map { month =>
          // headOption keeps the month when the URL ends at the month segment (e.g. .../2015/12)
          (Some(month), segments.tail.headOption.flatMap(findDay))
        }.getOrElse((None, None))
      }

      if (segments.isEmpty)
        (None, None, None)
      else {
        Try(segments.head.toInt).filter(y => y > 1970 && y < 3000).map { year =>
          val (month, day) = findMonthAndDay(segments.tail)
          (Some(year), month, day)
        }.getOrElse(findYearMonthAndDay(segments.tail))
      }
    }

    val (year, month, day) = findYearMonthAndDay(url.split("/"))
    year.map { y =>
      val m = month.getOrElse(1)
      val d = day.getOrElse(1)
      new DateTime(y, m, d, 0, 0).toDate
    }
  }
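
  // Worked example for extractDateFromURL (URL made up):
  // "http://example.com/news/2015/09/11/some-story" splits into segments; "2015" is the
  // first segment in (1970, 3000), then "09" parses as a month and "11" as a day, giving
  // a Date at midnight on 2015-09-11 in the default zone. For ".../2015/some-story" the
  // month and day default to 1.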

  /**
  * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number
  * of stopwords and the number of consecutive paragraphs together, which should form the cluster of text that this
  * node is around. We also track how high up the paragraphs are: comments are usually at the bottom and should get
  * a lower score
  */
  def calculateBestNodeBasedOnClustering(document: Document, lang: String): Option[Element] = {
    implicit val doc = document.clone

    val nodesToCheck = byTag("p") ++ byTag("td") ++ byTag("pre") ++ byTag("strong") ++ byTag("li") ++ byTag("code")

    val nodesWithText = nodesToCheck.filter { node =>
      val nodeText = node.text
      val wordStats = StopWords.stopWordCount(nodeText, lang)
      val highLinkDensity = isHighLinkDensity(node)
      logger.trace("Candidate: " + node.tagName() + " score: " + wordStats + " d:" + highLinkDensity + " text:" + nodeText)
      wordStats.stopWordCount > 2 && !highLinkDensity
    }

    val numberOfNodes = nodesWithText.size
    val bottomNodesForNegativeScore = numberOfNodes * 0.25

    logger.trace("About to inspect num of nodes with text: " + numberOfNodes)

    def boostScoreForNode(node: Element, startingBoost: Double, count: Int): (Double, Double) = {
      var newStartingBoost = startingBoost
      var result = 0.0
      if (isOkToBoost(node, lang)) {
        result = (1.0 / startingBoost) * 50
        newStartingBoost += 1
      }
      if (numberOfNodes > 15) {
        if ((numberOfNodes - count) <= bottomNodesForNegativeScore) {
          val booster = bottomNodesForNegativeScore - (numberOfNodes - count)
          result = -pow(booster, 2)
          if (abs(result) > 40)
            result = 5
        }
      }
      (newStartingBoost, result)
    }

    var count = 0
    var startingBoost: Double = 1.0
    val parentNodes = mutable.Set.empty[Element]

    for (node <- nodesWithText) {
      val (newStartingBoost, boostScore) = boostScoreForNode(node, startingBoost, count)
      startingBoost = newStartingBoost

      logger.trace("Location Boost Score: " + boostScore + " on interation: " + count + " tag='"+ node.tagName +"' id='" + node.parent.id + "' class='" + node.parent.attr("class"))

      val wordStats: WordStats = StopWords.stopWordCount(node.text, lang)
      val upscore: Int = (wordStats.stopWordCount + boostScore).toInt
      updateScore(node.parent, upscore)
      updateScore(node.parent.parent, upscore / 2)
      updateNodeCount(node.parent, 1)
      updateNodeCount(node.parent.parent, 1)
      parentNodes.add(node.parent)
      parentNodes.add(node.parent.parent)
      count += 1
    }

    if (parentNodes.isEmpty)
      None
    else {
      Some(parentNodes.maxBy(getScore)).filter(getScore(_) >= 20)
    }
  }
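
  // Shape of the scoring in calculateBestNodeBasedOnClustering, on a hypothetical
  // fragment: a <p> with 8 stop words inside <article><div> adds 8 (plus any location
  // boost) to the <div>'s gravityScore and half that to the <article>'s; the
  // highest-scoring parent is returned only if its final score reaches 20.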

  /**
  * a lot of the time the first paragraph might be the caption under an image, so if we're going to
  * boost a parent node we want to make sure it is connected to other paragraphs, at least for the first n paragraphs.
  * So we check that a nearby sibling is a paragraph with at least some substantial weight to it
  */
  private def isOkToBoost(node: Element, lang: String): Boolean = {
    var stepsAway: Int = 0
    val minimumStopWordCount = 5
    val maxStepsAwayFromNode = 3

    walkSiblings(node) { currentNode =>
      if (currentNode.tagName == "p" || currentNode.tagName == "strong") {
        if (stepsAway >= maxStepsAwayFromNode) {
          return false
        }
        val wordStats = StopWords.stopWordCount(currentNode.text, lang)
        if (wordStats.stopWordCount > minimumStopWordCount)
          return true
        stepsAway += 1
      }
    }
    false
  }
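
  // Sketch of the walk in isOkToBoost (siblings hypothetical): for <p a/><p b/><p candidate/>
  // it inspects b, then a; it returns true as soon as one of the first 3 preceding
  // <p>/<strong> siblings has more than 5 stop words, and false otherwise.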

  private def getShortText(e: String, max: Int): String = if (e.length > max) e.take(max) + "..." else e

  /**
   * Checks the density of links within a node. If there's not much text and what's there is mostly links,
   * we're not interested
   */
  private def isHighLinkDensity(implicit e: Element): Boolean = {
    val limit = 1.0
    val links = byTag("a") ++ byAttr("onclick")

    if (links.isEmpty)
      false
    else {
      val words = e.text.trim.split("\\s+")
      // count only the visible link text, not the surrounding markup
      val linkWords = links.map(_.text).mkString(" ").split("\\s+")
      val numberOfLinks = links.size
      val numberOfWords = words.length.toDouble
      val numberOfLinkWords = linkWords.length.toDouble
      val score = numberOfLinks * numberOfLinkWords / numberOfWords

      logger.trace("Calculated link density score as: {} for node: {}", score, getShortText(e.text, 50))

      score >= limit
    }
  }
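
  // Worked arithmetic for isHighLinkDensity on made-up numbers: 20 words of text with
  // 2 links of 3 words each scores 2 * 6 / 20 = 0.6, under the limit of 1.0, so the node
  // is kept; 4 links totalling 20 words in 15 words of text scores 4 * 20 / 15 ≈ 5.3,
  // so the node is treated as a link cluster.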

  private def getScore(node: Element): Int = getGravityScoreFromNode(node).getOrElse(0)

  private def getGravityScoreFromNode(node: Element): Option[Int] = Try(node.attr("gravityScore").toInt).toOption

  /**
  * adds a score to the gravityScore attribute we put on divs:
  * we read the current score and add the score passed in to it
  *
  * @param addToScore - the score to add to the node
  */
  private def updateScore(node: Element, addToScore: Int) {
    val currentScore = Try(node.attr("gravityScore").toInt).getOrElse(0)
    val newScore = currentScore + addToScore
    node.attr("gravityScore", newScore.toString)
  }
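
  // e.g. a node without a gravityScore attribute counts as 0, so updateScore(node, 10)
  // sets gravityScore="10" and a later updateScore(node, 5) bumps it to "15".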

  /**
  * stores how many decent nodes are under a parent node
  */
  private def updateNodeCount(node: Element, addToCount: Int) {
    val currentScore = Try(node.attr("gravityNodes").toInt).getOrElse(0)
    val newScore: Int = currentScore + addToCount
    node.attr("gravityNodes", newScore.toString)
  }

  /**
   * pulls out links we like
   */
  def extractLinks(implicit node: Element): Seq[Link] =
    select("a[href]")
      .filter(el => el.attr("href") != "#" && !el.attr("abs:href").trim.isEmpty)
      .map(el => Link(el.text, el.attr("abs:href")))

  private def isTableTagAndNoParagraphsExist(implicit e: Element): Boolean = {
    getChildParagraphs(e).filter(_.text.length < 25).foreach(remove)

    val subParagraphs2 = byTag("p")
    if (subParagraphs2.isEmpty && e.tagName != "td") {
      if (e.tagName == "ul" || e.tagName == "ol") {
        val linkTextLength = byTag("a").map(_.text.length).sum
        val elementTextLength = e.text.length
        elementTextLength <= 2 * linkTextLength
      }
      else true
    } else false
  }

  /**
  * remove any divs that look like non-content: clusters of links, or paragraphs with no gusto
  */
  def postExtractionCleanup(targetNode: Element, lang: String): Element = {
    val node = addSiblings(targetNode, lang)
    JListWrapper(node.children)
      .filter(e => e.tagName != "p" || isHighLinkDensity(e))
      .filter(e => isHighLinkDensity(e) || isTableTagAndNoParagraphsExist(e) || !isNodeScoreThresholdMet(node, e))
      .foreach(remove)
    node
  }

  private def isNodeScoreThresholdMet(node: Element, e: Element): Boolean = {
    val topNodeScore = getScore(node)
    val currentNodeScore = getScore(e)
    val thresholdScore = topNodeScore * 0.08
    !(currentNodeScore < thresholdScore && e.tagName != "td")
  }
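
  // e.g. with a top node scoring 100 the threshold is 8: a sibling div scoring 5 fails
  // the check and is removed by postExtractionCleanup, while a td passes regardless
  // of its score.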

  private def getChildParagraphs(implicit e: Element): Seq[Element] = byTag("p") ++ byTag("strong")

  /**
  * adds any siblings that may have a decent score to this node
  */
  private def getSiblingContent(currentSibling: Element,
                        baselineScoreForSiblingParagraphs: Int,
                        lang: String): Option[String] = {
    if ((currentSibling.tagName == "p" || currentSibling.tagName == "strong") && currentSibling.text.nonEmpty)
      Some(currentSibling.outerHtml)
    else {
      val siblingBaseLineScore = baselineScoreForSiblingParagraphs * 0.3
      val text = getChildParagraphs(currentSibling)
        .filter(p => StopWords.stopWordCount(p.text, lang).stopWordCount >= siblingBaseLineScore)
        .map(p => "

" + p.text + "

") .mkString(" ") if (text.isEmpty) None else Some(text) } } private def walkSiblings[T](node: Element)(work: (Element) => T): Seq[T] = { var currentSibling = node.previousElementSibling val b = mutable.Buffer[T]() while (currentSibling != null) { b += work(currentSibling) currentSibling = currentSibling.previousElementSibling } b } private def addSiblings(topNode: Element, lang: String): Element = { val baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(topNode, lang) val results = walkSiblings(topNode) { currentNode => getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, lang) }.reverse.flatten topNode.child(0).before(results.mkString) topNode } /** * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of * 100 then 100 should be our base. */ private def getBaselineScoreForSiblings(topNode: Element, lang: String): Int = { val nodesToCheck = getChildParagraphs(topNode) val scores = nodesToCheck.flatMap { node => val wordStats = StopWords.stopWordCount(node.text, lang) if (wordStats.stopWordCount > 2 && !isHighLinkDensity(node)) Some(wordStats.stopWordCount) else None } if (scores.nonEmpty) scores.sum / scores.length else Int.MaxValue } }