net.dankito.readability4j.processor.Preprocessor.kt Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of Readability4J Show documentation
A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.
The newest version!
package net.dankito.readability4j.processor

import net.dankito.readability4j.util.RegExUtil
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.slf4j.LoggerFactory

/**
 * Performs basic sanitization before starting the extraction process.
 */
open class Preprocessor(protected val regEx: RegExUtil = RegExUtil()) : ProcessorBase() {

    companion object {
        private val log = LoggerFactory.getLogger(Preprocessor::class.java)
    }


    /**
     * Prepare the HTML document for readability to scrape it.
     * This includes things like stripping javascript, CSS, and handling terrible markup.
     *
     * The document is modified in place. The steps run in this order: scripts, styles, forms,
     * comments, `<br>` chains, and finally `<font>` tags (handed to `replaceNodes` with "span").
     *
     * @param document the parsed document to sanitize
     */
    open fun prepareDocument(document: Document) {
        log.debug("Starting to prepare document")

        removeScripts(document)

        removeStyles(document)

        removeForms(document) // TODO: this is not in Mozilla's Readability

        removeComments(document) // TODO: this is not in Mozilla's Readability

        replaceBrs(document, regEx)

        replaceNodes(document, "font", "span")
    }


    /**
     * Strips `<script>` elements and handles `<noscript>` elements.
     *
     * A `<noscript>` is unwrapped (its children stay in the document) when
     * [shouldKeepImageInNoscriptElement] says it carries an image worth keeping; otherwise it is
     * removed together with its content.
     *
     * @param document the document to clean
     */
    protected open fun removeScripts(document: Document) {
        // NOTE(review): the callback's `true` presumably tells removeNodes to go ahead and remove
        // the node - confirm against ProcessorBase.
        removeNodes(document, "script") { scriptNode ->
            scriptNode.`val`(null) // TODO: what is this good for?
            scriptNode.removeAttr("src")
            true
        }

        document.getElementsByTag("noscript").forEach { noscript ->
            if(shouldKeepImageInNoscriptElement(document, noscript)) { // TODO: this is not in Mozilla's Readability
                noscript.unwrap()
            }
            else {
                printAndRemove(noscript, "removeScripts('noscript')")
            }
        }
    }

    /**
     * Decides whether a `<noscript>` element contains at least one image that is not already
     * present elsewhere in the document.
     *
     * An image counts as already present when an `<img>` outside of [noscript] has the same
     * `src` value (compared trimmed and case-insensitively). Images inside [noscript] itself are
     * excluded from that comparison; otherwise every image would match itself and nothing could
     * ever be kept. The comparison is done on attribute values directly rather than via a
     * selector string, so unusual `src` values (empty, containing brackets or quotes) cannot
     * cause a selector parse failure.
     *
     * @param document the document that contains [noscript]
     * @param noscript the `<noscript>` element to inspect
     * @return true if [noscript] holds an image whose `src` occurs nowhere else in [document];
     *         false if it holds no images or only duplicates
     */
    protected open fun shouldKeepImageInNoscriptElement(document: Document, noscript: Element): Boolean {
        val noscriptImages = noscript.select("img")
        if(noscriptImages.isEmpty()) {
            return false
        }

        // Sources of all images that live outside this <noscript>. Identity comparison is used
        // so that the result does not depend on how Element.equals() is implemented.
        val sourcesOutsideNoscript = document.getElementsByTag("img")
                .filter { candidate -> noscriptImages.none { it === candidate } }
                .map { it.attr("src").trim() }

        return noscriptImages.any { image ->
            val src = image.attr("src").trim()
            sourcesOutsideNoscript.none { it.equals(src, ignoreCase = true) }
        }
    }

    /**
     * Removes all `<style>` elements via `removeNodes`.
     *
     * @param document the document to clean
     */
    protected open fun removeStyles(document: Document) {
        removeNodes(document, "style")
    }

    /**
     * Removes all `<form>` elements via `removeNodes`.
     *
     * @param document the document to clean
     */
    protected open fun removeForms(document: Document) {
        removeNodes(document, "form")
    }

    /**
     * Recursively removes all comment nodes below [node].
     *
     * @param node the node whose subtree is searched; the node itself is never removed
     */
    protected open fun removeComments(node: Node) {
        var i = 0
        while (i < node.childNodeSize()) {
            val child = node.childNode(i)
            if(child.nodeName() == "#comment") {
                // The index is not advanced here: printAndRemove is expected to detach the
                // child, which moves the following sibling to position i.
                printAndRemove(child, "removeComments")
            }
            else {
                removeComments(child)
                i++
            }
        }
    }


    /**
     * Replaces 2 or more successive `<br>` elements with a single `<p>`.
     * Whitespace between `<br>` elements is ignored. For example:
     *
     *     <div>foo<br>bar<br> <br><br>abc</div>
     *
     * will become:
     *
     *     <div>foo<br>bar<p>abc</p></div>
     *
     * NOTE(review): this relies on the inherited `nextElement` returning the given node itself
     * when it is already an element, and otherwise the next element after skipping whitespace -
     * confirm against ProcessorBase.
     *
     * @param document the document whose body is processed
     * @param regEx passed through to `nextElement`, presumably for its whitespace detection
     */
    protected open fun replaceBrs(document: Document, regEx: RegExUtil) {
        document.body().select("br").forEach { br ->
            // Whether 2 or more <br> elements have been found and replaced with a <p> block.
            var replaced = false

            // If we find a <br> chain, remove the <br>s until we hit another element
            // or non-whitespace. This leaves behind the first <br> in the chain
            // (which will be replaced with a <p> later).
            var next: Node? = nextElement(br.nextSibling(), regEx)
            while(next != null && next.nodeName() == "br") {
                replaced = true
                // Remember the follower before detaching, the removed node loses its siblings.
                val brSibling = next.nextSibling()
                printAndRemove(next, "replaceBrs")
                next = nextElement(brSibling, regEx)
            }

            // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
            // all sibling nodes as children of the <p> until we hit another <br>
            // chain.
            if(replaced) {
                val p = br.ownerDocument().createElement("p")
                br.replaceWith(p)

                next = p.nextSibling()
                while(next != null) {
                    // If we've hit another <br><br>, we're done adding children to this <p>.
                    // The look-ahead has to start at the node AFTER this <br>; starting at the
                    // <br> itself would find that very <br> and end the paragraph at every
                    // single line break.
                    if(next.nodeName() == "br") {
                        val nextElem = nextElement(next.nextSibling(), regEx)
                        if(nextElem != null && nextElem.tagName() == "br")
                            break
                    }

                    // Otherwise, make this node a child of the new <p>.
                    val sibling = next.nextSibling()
                    p.appendChild(next)
                    next = sibling
                }
            }
        }
    }

}