All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mchange.feedletter.ItemContent.scala Maven / Gradle / Ivy

package com.mchange.feedletter

import java.time.Instant
import scala.xml.{Elem,Node,NodeSeq,XML}
import scala.util.{Success,Failure}
import audiofluidity.rss.util.*
import com.mchange.conveniences.collection.*
import com.mchange.conveniences.string.*
import com.mchange.sc.v1.log.*
import MLevel.*
import com.mchange.mailutil.Smtp
import cats.instances.try_
import scala.util.control.NonFatal

/**
 * Companion of [[ItemContent]]: factory methods that build an item from RSS,
 * plus the [[ItemContent.Media]] value type.
 */
object ItemContent:
  private lazy given logger : MLogger = MLogger(this)

  // Labels of the elements for which whitespaceSignificant reports false.
  private val WhitespaceInsignificantLabels : Set[String] = Set( "rss", "channel", "item", "image" )

  /** False for the elements labelled rss, channel, item or image; true for every other element. */
  private def whitespaceSignificant( elem : Elem ) : Boolean =
    !WhitespaceInsignificantLabels.contains( elem.label )

  /**
   * Builds an ItemContent for the item identified by `guid` within `rawRss`.
   *
   * The feed is first reduced to a single-item RSS document, which is then passed
   * through `stripInsignificantWhitespaceRecursive` before being wrapped.
   *
   * @throws FeedletterException if the single-item RSS document could not be created
   */
  def fromRssGuid( rawRss : Elem, guid : String ) : ItemContent =
    audiofluidity.rss.util.singleItemRss( rawRss, guid, SkipUnstableChannelElements ) match
      case Right( singleItem ) =>
        val normalized = stripInsignificantWhitespaceRecursive( singleItem, whitespaceSignificant )
        new ItemContent( guid, normalized )
      case Left( problem ) =>
        throw new FeedletterException( s"Could not create single-item RSS: $problem" )

  /**
   * Builds an ItemContent from the text of a single-item RSS document.
   * The text is parsed as XML and used as-is; no whitespace stripping is applied here.
   */
  def fromPrenormalizedSingleItemRss( guid : String, prenormalizedSingleItemRssStr : String ) : ItemContent =
    val parsed = XML.loadString( prenormalizedSingleItemRssStr )
    new ItemContent( guid, parsed )

  /** A media resource attached to an item: its URL plus optional MIME type, length and alternative text. */
  final case class Media(
    url      : String,
    mimeType : Option[String],
    length   : Option[Long],
    alt      : Option[String]
  )

import ItemContent.Media

/**
 * The content of a single feed item, backed by a single-item RSS document.
 *
 * Each of title, author, article, pubDate, link and media is taken from the
 * matching `override*` constructor value when one is set, and is otherwise
 * extracted from the `item` element of `rssElem`.
 *
 * The constructor is private; instances are created via the companion object.
 *
 * @param guid            the guid of the item
 * @param rssElem         RSS document, expected to contain a single `channel/item`
 * @param overrideTitle   replaces the extracted title when defined
 * @param overrideAuthor  replaces the extracted author when defined
 * @param overrideArticle replaces the extracted article content when defined
 * @param overridePubDate replaces the extracted publication date when defined
 * @param overrideLink    replaces the extracted link when defined
 * @param overrideMedia   replaces the extracted media when defined
 */
case class ItemContent private (
  val guid        : String,
  val rssElem     : Elem,
  overrideTitle   : Option[String]     = None,
  overrideAuthor  : Option[String]     = None,
  overrideArticle : Option[String]     = None,
  overridePubDate : Option[Instant]    = None,
  overrideLink    : Option[String]     = None,
  overrideMedia   : Option[Seq[Media]] = None
):
  import ItemContent.logger

  // Each withXxx returns a copy with the corresponding override set; this instance is unchanged.
  def withTitle( title : String )      : ItemContent = this.copy( overrideTitle = Some( title ) )
  def withAuthor( author : String )    : ItemContent = this.copy( overrideAuthor = Some( author ) )
  def withArticle( article : String )  : ItemContent = this.copy( overrideArticle = Some( article ) )
  def withPubDate( pubDate : Instant ) : ItemContent = this.copy( overridePubDate = Some( pubDate ) )
  def withLink( link : String )        : ItemContent = this.copy( overrideLink = Some( link ) )
  def withMedia( media : Seq[Media] )  : ItemContent = this.copy( overrideMedia = Some( media ) )

  /**
   * The `channel/item` element of `rssElem`. Evaluated eagerly at construction;
   * the handler passed to `uniqueOr` throws an AssertionError (presumably invoked
   * when the selection is not exactly one node).
   */
  val itemElem : Elem = (rssElem \ "channel" \ "item").uniqueOr { (ns : NodeSeq, nu : NotUnique) =>
    throw new AssertionError( s"ItemContent should only be initialized with single-item RSS, we found ${nu}." )
  }.asInstanceOf[Elem]

  // Overrides win; extraction from the RSS is the fallback and happens lazily, at most once.
  lazy val title   : Option[String]  = overrideTitle   orElse extractTitle
  lazy val author  : Option[String]  = overrideAuthor  orElse extractCreatorAuthor
  lazy val article : Option[String]  = overrideArticle orElse extractContent
  lazy val pubDate : Option[Instant] = overridePubDate orElse extractPubDate
  lazy val link    : Option[String]  = overrideLink    orElse extractLink
  lazy val media   : Seq[Media]      = overrideMedia.getOrElse( extractMedia )

  def contentHash : Int = this.## // XXX: should I use a better, more guaranteed-stable hash?

  /** Prefers a Dublin Core `creator` element; falls back to the plain `author` element. */
  private def extractCreatorAuthor : Option[String] =
    def mbCreator = (itemElem \ "creator").filter(lenientDublinCoreNamespace).headOption.map( _.text.trim )
    def mbAuthor  = (itemElem \ "author").headOption.map( parseAuthorFromAuthorElem )
    mbCreator orElse mbAuthor

  /** Trimmed text of the first of: `content:encoded`, Atom `content`, plain `description`. */
  private def extractContent =
    val contentElem =
      (itemElem \ "encoded").filter(lenientRdfContentNamespace).headOption orElse
      (itemElem \ "content").filter(lenientAtomNamespace).headOption orElse
      (itemElem \ "description").headOption
    contentElem.map( _.text.trim )

  private def extractTitle = (itemElem \ "title").headOption.map( _.text.trim )

  /** Parses the item's `pubDate`; an unparseable date is logged at WARNING and yields None. */
  private def extractPubDate =
    (itemElem \ "pubDate").headOption.map( _.text.trim ).flatMap: dateStr =>
      attemptLenientParsePubDateToInstant( dateStr ) match
        case Success( instant ) => Some( instant )
        case Failure( t ) =>
          val itemTitle = extractTitle.getOrElse("(untitled)")
          WARNING.log(s"""From '${itemTitle}', could not parse pubDate '${dateStr}'.""", t )
          None

  /**
   * Picks the item's link, in order of preference: a guid explicitly marked as a
   * permalink, a feedburner `origLink`, then the plain `link` element.
   */
  private def extractLink =
    val guidElems = itemElem \ "guid"
    // RSS 2.0 spells the attribute "isPermaLink" and XML attribute names are case-sensitive.
    // The historical lowercase spelling is still accepted so nothing that matched before stops matching.
    val guidIsPermalink = Seq( "isPermaLink", "isPermalink" ).exists( attr => (guidElems \@ attr) == "true" )
    def guidLink = if guidIsPermalink then guidElems.headOption else None
    def origLink = (itemElem \ "origLink").headOption // feedburner BS, but better than the indirection it otherwise embeds
    def straightLink = (itemElem \ "link").headOption
    (guidLink orElse origLink orElse straightLink).map( _.text.trim )

  /** Media RSS `content` elements first, then plain RSS `enclosure` elements. */
  private def extractMedia = mediaFromMrssContent ++ mediaFromEnclosure

  private def mediaFromMrssContent =
    val mediaContentElems = (itemElem \ "content").filter( lenientMediaNamespace )
    val maybes =
      mediaContentElems.map: elem =>
        val url = (elem \@ "url").toOptionNotBlank
        val mimeType = (elem \@ "type").toOptionNotBlank
        // A non-numeric fileSize is dropped rather than failing extraction of all media.
        val length = (elem \@ "fileSize").toOptionNotBlank.flatMap( _.trim.toLongOption )
        val alt = (elem \ "description").headOption.map( _.text.trim )
        url.map( u => Media( u, mimeType, length, alt ) )
    maybes.actuals

  private def mediaFromEnclosure =
    val enclosureElems = (itemElem \ "enclosure")
    val maybes =
      enclosureElems.map: elem =>
        val url = (elem \@ "url").toOptionNotBlank
        val mimeType = (elem \@ "type").toOptionNotBlank
        // A non-numeric length is dropped rather than failing extraction of all media.
        val length = (elem \@ "length").toOptionNotBlank.flatMap( _.trim.toLongOption )
        url.map( u => Media( u, mimeType, length, None ) )
    maybes.actuals

  /**
   * Renders the text of an `author` element as display names. The text is parsed as
   * comma-separated e-mail addresses; each contributes its display name, or its e-mail
   * when it has none. If parsing fails or yields nothing, the trimmed raw text is returned.
   */
  private def parseAuthorFromAuthorElem( authorElem : Node ) : String =
    def fromAddress( a : Smtp.Address ) : String = a.displayName.getOrElse( a.email )
    val text = authorElem.text.trim
    try
      val addresses = Smtp.Address.parseCommaSeparated(text)
      val names = addresses.map( fromAddress )
      names.size match
        case 0 => text
        case 1 => names.head
        case _ => (names.init :+ s"and ${names.last}").mkString(", ")
    catch
      case NonFatal( _ ) => text

  // Node.namespace is null for an element with no bound namespace, hence the Option wrapper.
  // The match is on a substring, so scheme and trailing-slash variants of a namespace URI are accepted.
  private def namespaceContains( node : Node, fragment : String ) : Boolean =
    Option( node.namespace ).exists( _.contains( fragment ) )

  private def lenientRdfContentNamespace(node : Node ) : Boolean =
    namespaceContains( node, "://purl.org/rss/1.0/modules/content" )

  private def lenientDublinCoreNamespace(node : Node ) : Boolean =
    namespaceContains( node, "://purl.org/dc/elements/1.1" )

  private def lenientAtomNamespace(node : Node ) : Boolean =
    namespaceContains( node, "://www.w3.org/2005/Atom" )

  private def lenientMediaNamespace(node : Node ) : Boolean =
    namespaceContains( node, "://search.yahoo.com/mrss" )





© 2015 - 2025 Weber Informatics LLC | Privacy Policy