All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.dankito.readability4j.extended.processor.PostprocessorExtended.kt Maven / Gradle / Ivy

Go to download

A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.

The newest version!
package net.dankito.readability4j.extended.processor

import net.dankito.readability4j.processor.Postprocessor
import org.jsoup.nodes.Attributes
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.parser.Tag


/**
 * [Postprocessor] variant that applies a few extra clean-up steps to the extracted article:
 *
 * - lazy-loaded images get their real URL copied into `src`,
 * - empty `<amp-img>` elements get a plain `<img>` child,
 * - a `<base href="...">` declared in the original document's head takes precedence over the
 *   path base handed in by the caller when relative URIs are fixed.
 */
open class PostprocessorExtended : Postprocessor() {

    /**
     * Runs the additional image fix-ups on [articleContent] and then delegates to the base
     * implementation.
     *
     * The fix-ups are applied first so that the URLs they put into place are still seen by the
     * base implementation, which is expected to turn relative URLs into absolute ones.
     */
    override fun postProcessContent(
        originalDocument: Document,
        articleContent: Element,
        articleUri: String,
        additionalClassesToPreserve: Collection<String>
    ) {
        makeLazyLoadingUrlsEagerLoading(articleContent)
        fixAmpImageUris(articleContent)

        super.postProcessContent(originalDocument, articleContent, articleUri, additionalClassesToPreserve)
    }


    /**
     * For every `<img>` below [articleContent], copies the URL from the first populated
     * lazy-loading attribute (e.g. `data-src`) into `src`.
     */
    protected open fun makeLazyLoadingUrlsEagerLoading(articleContent: Element) {
        // checked in this order; the first attribute holding a non-blank value wins
        val lazyLoadingAttributes = listOf(
            "data-src", "data-original", "data-actualsrc", "data-lazy-src", "data-delayed-url",
            "data-li-src", "data-pagespeed-lazy-src"
        )

        for (imgElement in articleContent.select("img")) {
            makeLazyLoadingUrlEagerLoading(imgElement, "src", lazyLoadingAttributes)
        }
    }

    /**
     * Sets [attributeToSet] on [element] to the value of the first attribute from
     * [lazyLoadingAttributes] that has a non-blank value. Leaves [element] untouched if none
     * of them has one.
     */
    protected open fun makeLazyLoadingUrlEagerLoading(
        element: Element,
        attributeToSet: String,
        lazyLoadingAttributes: List<String>
    ) {
        val eagerUrl = lazyLoadingAttributes
            .asSequence()
            .map { attributeName -> element.attr(attributeName) } // .attr() returns "" for a missing attribute
            .firstOrNull { it.isNotBlank() }

        if (eagerUrl != null) {
            element.attr(attributeToSet, eagerUrl)
        }
    }

    /**
     * Appends an `<img>` child to every `<amp-img>` below [element] that has no child nodes,
     * carrying over the `alt`, `srcset` and (if present) `src` attributes of the `<amp-img>`.
     */
    protected open fun fixAmpImageUris(element: Element) {
        element.getElementsByTag("amp-img")
            .filter { ampImg -> ampImg.childNodeSize() == 0 }
            .forEach { ampImg ->
                val attributes = Attributes()
                attributes.put("decoding", "async")
                attributes.put("alt", ampImg.attr("alt"))
                attributes.put("srcset", ampImg.attr("srcset").trim())

                // an <amp-img> may specify only `src`; without copying it the <img> would have no source
                val src = ampImg.attr("src").trim()
                if (src.isNotBlank()) {
                    attributes.put("src", src)
                }

                ampImg.appendChild(Element(Tag.valueOf("img"), "", attributes))
            }
    }


    /**
     * Delegates to the base implementation, replacing [pathBase] with the `href` of the first
     * `<base href>` element in the head of [originalDocument] if there is one with a non-blank
     * value.
     */
    override fun fixRelativeUris(
        originalDocument: Document,
        element: Element,
        scheme: String,
        prePath: String,
        pathBase: String
    ) {
        // .attr() returns "" rather than null, so a blank href has to be filtered out explicitly;
        // otherwise an empty path base would be passed on
        val baseUrl = originalDocument.head()
            .select("base[href]")
            .first()
            ?.attr("href")
            ?.takeIf { it.isNotBlank() }

        super.fixRelativeUris(originalDocument, element, scheme, prePath, baseUrl ?: pathBase)
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy