All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.dankito.readability4j.processor.ArticleGrabber.kt Maven / Gradle / Ivy

Go to download

A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.

There is a newer version: 1.0.8
Show newest version
package net.dankito.readability4j.processor

import net.dankito.readability4j.model.ArticleGrabberOptions
import net.dankito.readability4j.model.ArticleMetadata
import net.dankito.readability4j.model.ReadabilityObject
import net.dankito.readability4j.model.ReadabilityOptions
import net.dankito.readability4j.util.RegExUtil
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.slf4j.LoggerFactory
import java.util.*
import kotlin.collections.ArrayList
import kotlin.collections.HashMap


open class ArticleGrabber(protected val options: ReadabilityOptions, protected val regEx: RegExUtil = RegExUtil()) : ProcessorBase() {

    companion object {
        // Element tags to score by default.
        // prepareNodes() collects every element with one of these tags as an element to score.
        val DEFAULT_TAGS_TO_SCORE = Arrays.asList("section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre")
        
        // Tags that hasChildBlockElement() treats as block level: a div that contains none of
        // them (at any depth) is turned into a p by prepareNodes().
        val DIV_TO_P_ELEMS = Arrays.asList("a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", "select")

        // Tags createArticleContent() keeps as they are; any other appended sibling is renamed to div.
        val ALTER_TO_DIV_EXCEPTIONS = Arrays.asList("div", "article", "section", "p")

        // Attributes cleanStyles() removes from elements.
        val PRESENTATIONAL_ATTRIBUTES = Arrays.asList("align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace")

        // Tags from which cleanStyles() additionally removes the width and height attributes.
        val DEPRECATED_SIZE_ATTRIBUTE_ELEMS = Arrays.asList("table", "th", "td", "hr", "pre")
        
        // NOTE(review): not referenced in the visible part of this class - presumably used by the
        // conditional cleaning code further down; confirm.
        val EMBEDDED_NODES = Arrays.asList("object", "embed", "iframe")

        // markDataTables() classifies a table as a data table if it contains any of these tags.
        val DATA_TABLE_DESCENDANTS = Arrays.asList("col", "colgroup", "tfoot", "thead", "th")


        private val log = LoggerFactory.getLogger(ArticleGrabber::class.java)
    }


    // Byline text captured by checkByline() from the first element that looks like a valid byline;
    // stays null until one is found.
    var articleByline: String? = null
        protected set

    // NOTE(review): never assigned in the visible part of this class - presumably set by
    // getTextDirection(), which grabArticle() calls on success; confirm against that method.
    var articleDir: String? = null
        protected set


    // Maximum number of entries getTopCandidate() keeps in its ranked list of top candidates.
    protected val nbTopCandidates = options.nbTopCandidates
    // Minimum text length (grabArticle() compares it against a character count) the extracted
    // article must reach for a parse attempt to be accepted.
    protected val wordThreshold = options.wordThreshold

    /** Score bookkeeping per element; entries are added by [initializeNode]. */
    protected val readabilityObjects = HashMap<Element, ReadabilityObject>()

    /**
     * Data-table flag per table element.
     * NOTE(review): the value type is inferred from markDataTables(), which stores Boolean flags
     * through setReadabilityDataTable() - confirm against that accessor.
     */
    protected val readabilityDataTable = HashMap<Element, Boolean>()


    /**
     * Runs the extraction pipeline (prepare nodes, score, pick top candidate, collect siblings,
     * clean up) and returns the element holding the article content.
     *
     * If the extracted text is shorter than [wordThreshold], the page markup is restored and the
     * pipeline is re-run with one more flag of [options] switched off (stripUnlikelyCandidates,
     * then weightClasses, then cleanConditionally). Note that this mutates the passed [options].
     * When all flags are exhausted the attempt with the longest text is returned.
     *
     * @param doc the document to extract from; it is modified in place.
     * @param metadata article metadata, forwarded to [prepArticle].
     * @param options flags controlling the extraction; modified on retries.
     * @param pageElement element to treat as the page; defaults to the document body.
     * @return the article content, or null if there is no page or no attempt produced any text.
     */
    open fun grabArticle(doc: Document, metadata: ArticleMetadata, options: ArticleGrabberOptions = ArticleGrabberOptions(), pageElement: Element? = null): Element? {
        log.debug("**** grabArticle ****")

        val isPaging = pageElement != null
        val page = pageElement ?: doc.body()

        // We can't grab an article if we don't have a page!
        if(page == null) {
            log.debug("No body found in document. Abort.")
            return null
        }

        // Cache the page's own inner HTML (not the whole document's), because this is what gets
        // written back into `page` before a retry.
        val pageCacheHtml = page.html()

        // Results of all failed attempts. Must live outside the loop so that the final fallback
        // can choose among every attempt, not just the last one.
        val attempts = ArrayList<Pair<Element, Int>>()

        while(true) {
            // First, node prepping. Trash nodes that look cruddy (like ones with the
            // class name "comment", etc), and turn divs into P tags where they have been
            // used inappropriately (as in, where they contain no other block level elements.)
            val elementsToScore = prepareNodes(doc, options)

            // Loop through all paragraphs, and assign a score to them based on how content-y they
            // look. Then add their score to their parent node.
            val candidates = scoreElements(elementsToScore, options)

            // After we've calculated scores, loop through all of the possible
            // candidate nodes we found and find the one with the highest score.
            val (topCandidate, neededToCreateTopCandidate) = getTopCandidate(page, candidates, options)

            // Now that we have the top candidate, look through its siblings for content
            // that might also be related. Things like preambles, content split by ads
            // that we removed, etc.
            val articleContent = createArticleContent(doc, topCandidate, isPaging)

            log.debug("Article content pre-prep: {}", articleContent.html())
            // So we have all of the content that we need. Now we clean it up for presentation.
            prepArticle(articleContent, options, metadata)
            log.debug("Article content post-prep: {}", articleContent.html())

            if(neededToCreateTopCandidate) {
                // We already created a fake div thing, and there wouldn't have been any siblings left
                // for the previous loop, so there's no point trying to create a new div, and then
                // move all the children over. Just assign IDs and class names here. No need to append
                // because that already happened anyway.
                topCandidate.attr("id", "readability-page-1")
                topCandidate.addClass("page")
            }
            else {
                val div = doc.createElement("div")
                div.attr("id", "readability-page-1")
                div.addClass("page")

                // Iterate over a copy, as moving the children modifies the child node list.
                ArrayList(articleContent.childNodes()).forEach { child ->
                    child.remove()
                    div.appendChild(child)
                }

                articleContent.appendChild(div)
            }

            log.debug("Article content after paging: {}", articleContent.html())

            // Now that we've gone through the full algorithm, check to see if
            // we got any meaningful content. If we didn't, we may need to re-run
            // grabArticle with different flags set. This gives us a higher likelihood of
            // finding the content, and the sieve approach gives us a higher likelihood of
            // finding the -right- content.
            val textLength = getInnerText(articleContent, regEx, true).length
            if(textLength >= this.wordThreshold) {
                // Find out text direction from ancestors of final top candidate.
                getTextDirection(topCandidate, doc)

                return articleContent
            }

            // Not enough text: undo this attempt's DOM changes and remember its result.
            page.html(pageCacheHtml)
            attempts.add(Pair(articleContent, textLength))

            when {
                options.stripUnlikelyCandidates -> {
                    options.stripUnlikelyCandidates = false
                }
                options.weightClasses -> {
                    options.weightClasses = false
                }
                options.cleanConditionally -> {
                    options.cleanConditionally = false
                }
                else -> {
                    // No luck after removing flags, just return the longest text we found during
                    // the different loops (longest first; the sort is stable, so ties keep the
                    // earliest attempt).
                    attempts.sortByDescending { it.second }
                    val longestAttempt = attempts[0]

                    // But first check if we actually have something
                    if(longestAttempt.second <= 0) {
                        return null
                    }

                    // Find out text direction from ancestors of final top candidate.
                    getTextDirection(topCandidate, doc)

                    return longestAttempt.first
                }
            }
        }
    }


    /*             First step: prepare nodes           */

    /**
     * Walks the document, removes nodes that should not take part in scoring and collects the
     * elements that should.
     *
     * Per element, in this order: bylines are removed (see [checkByline]); unlikely candidates are
     * removed when `options.stripUnlikelyCandidates` is set; empty container/heading elements are
     * removed; elements whose tag is in [DEFAULT_TAGS_TO_SCORE] are collected; divs are either
     * replaced by their single P child, renamed to P (no block level children), or get their
     * non-blank text nodes wrapped in P elements.
     *
     * @param doc the document to walk; it is modified in place.
     * @param options extraction flags; only `stripUnlikelyCandidates` is read here.
     * @return the elements to pass to [scoreElements].
     */
    protected open fun prepareNodes(doc: Document, options: ArticleGrabberOptions): List<Element> {
        val elementsToScore = ArrayList<Element>()
        var node: Element? = doc

        while(node != null) {
            val matchString = node.className() + " " + node.id()
            val tagName = node.tagName()

            // Check to see if this node is a byline, and remove it if it is.
            if(checkByline(node, matchString)) {
                node = removeAndGetNext(node, "byline")
                continue
            }

            // Remove unlikely candidates
            if(options.stripUnlikelyCandidates &&
                    regEx.isUnlikelyCandidate(matchString) &&
                    !regEx.okMaybeItsACandidate(matchString) &&
                    tagName != "body" &&
                    tagName != "a") {
                node = removeAndGetNext(node, "Removing unlikely candidate")
                continue
            }

            // Remove DIV, SECTION, HEADER and heading nodes without any content (e.g. text, image, video, or iframe).
            val mayBeRemovedWhenEmpty = when(tagName) {
                "div", "section", "header", "h1", "h2", "h3", "h4", "h5", "h6" -> true
                else -> false
            }
            if(mayBeRemovedWhenEmpty && isElementWithoutContent(node)) {
                node = removeAndGetNext(node, "node without content")
                continue
            }

            if(DEFAULT_TAGS_TO_SCORE.contains(tagName)) {
                elementsToScore.add(node)
            }

            // The element the traversal continues from; differs from `node` only when a div
            // gets replaced by its single P child below.
            var current: Element = node

            // Turn all divs that don't have children block level elements into p's
            if(tagName == "div") {
                when {
                    // Sites like http://mobile.slate.com encloses each paragraph with a DIV
                    // element. DIVs with only a P element inside and no text content can be
                    // safely converted into plain P elements to avoid confusing the scoring
                    // algorithm with DIVs with are, in practice, paragraphs.
                    hasSinglePInsideElement(current) -> {
                        val paragraph = current.child(0)
                        current.replaceWith(paragraph)
                        current = paragraph
                        elementsToScore.add(current)
                    }

                    !hasChildBlockElement(current) -> {
                        setNodeTag(current, "p")
                        elementsToScore.add(current)
                    }

                    else -> {
                        // EXPERIMENTAL: wrap each non-blank text node of the div in its own P
                        current.childNodes().forEach { childNode ->
                            if(childNode is TextNode && childNode.text().trim().length > 0) {
                                val p = doc.createElement("p")
                                p.text(childNode.text())
                                p.attr("style", "display: inline;")
                                p.addClass("readability-styled")
                                childNode.replaceWith(p)
                            }
                        }
                    }
                }
            }

            node = getNextNode(current)
        }

        return elementsToScore
    }


    /**
     * Tests whether [node] is the article's byline and, if so, stores its trimmed text in
     * [articleByline].
     *
     * Only the first byline is recorded: once [articleByline] is set this always returns false.
     * A node qualifies if its `rel` attribute is "author" or [matchString] matches the byline
     * pattern, and its whole text passes [isValidByline].
     *
     * @param matchString the node's class name and id, as built by the caller.
     * @return true if the node was recorded as byline.
     */
    protected open fun checkByline(node: Element, matchString: String): Boolean {
        if(articleByline != null) {
            return false
        }

        val looksLikeByline = node.attr("rel") == "author" || regEx.isByline(matchString)
        if(!looksLikeByline || !isValidByline(node.wholeText())) {
            return false
        }

        articleByline = node.text().trim()
        return true
    }

    /**
     * Decides whether [text] could be a byline: after trimming it must be
     * non-empty and shorter than 100 characters.
     */
    protected open fun isValidByline(text: String): Boolean {
        val trimmedLength = text.trim().length

        return trimmedLength in 1..99
    }


    /**
     * Returns true if [node] has only blank text and either no child elements at all, or as many
     * child elements as it has `br` and `hr` elements underneath it.
     */
    protected open fun isElementWithoutContent(node: Element): Boolean {
        if(!node.text().isBlank()) {
            return false
        }

        val childCount = node.children().size
        if(childCount == 0) {
            return true
        }

        val breakAndRuleCount = node.getElementsByTag("br").size + node.getElementsByTag("hr").size
        return childCount == breakAndRuleCount
    }


    /**
     * Tells whether [element] consists of exactly one P child element and,
     * apart from that, only text nodes without real content.
     * Returns false if it has no or more than one child element, if the single
     * child is not a P, or if any of its text nodes has content.
     */
    protected open fun hasSinglePInsideElement(element: Element): Boolean {
        // There should be exactly 1 element child which is a P:
        val childElements = element.children()
        if(childElements.size != 1 || childElements[0].tagName() != "p") {
            return false
        }

        // And there should be no text nodes with real content
        return element.childNodes().none { child ->
            child is TextNode && regEx.hasContent(child.text())
        }
    }

    /**
     * Checks recursively whether [element] has a descendant element whose tag
     * is listed in [DIV_TO_P_ELEMS].
     */
    protected open fun hasChildBlockElement(element: Element): Boolean {
        return element.children().any { child ->
            DIV_TO_P_ELEMS.contains(child.tagName()) || hasChildBlockElement(child)
        }
    }

    // Renames the tag of `node` to `tagName` by calling the element's tagName(String) setter.
    // Kept as a separate open method so subclasses can change how nodes get renamed.
    protected open fun setNodeTag(node: Element, tagName: String) {
        node.tagName(tagName)
    }



    /*          Second step: Score elements             */

    /**
     * Scores each element of [elementsToScore] and distributes that score to up to three of its
     * ancestors.
     *
     * An element's score is 1, plus the number of comma separated text segments, plus one point
     * per 100 characters (at most 3). Elements without parent, with less than 25 characters of
     * inner text, or without ancestors are skipped. The parent receives the full score, the
     * grandparent half of it and the great grandparent a sixth.
     *
     * @return all ancestors that got a readability object here, in order of first appearance.
     */
    protected open fun scoreElements(elementsToScore: List<Element>, options: ArticleGrabberOptions): List<Element> {
        val candidates = ArrayList<Element>()

        elements@ for(elementToScore in elementsToScore) {
            if(elementToScore.parentNode() == null) {
                continue
            }

            // If this paragraph is less than 25 characters, don't even count it.
            val innerText = getInnerText(elementToScore, regEx)
            if(innerText.length < 25) {
                continue
            }

            // Exclude nodes with no ancestor.
            val ancestors = getNodeAncestors(elementToScore, 3)
            if(ancestors.isEmpty()) {
                continue
            }

            // A base point for the paragraph itself, a point per comma separated segment, and
            // another point for every 100 characters in this paragraph (up to 3 points).
            val contentScore = 1.0 + innerText.split(',').size + Math.min(Math.floor(innerText.length / 100.0), 3.0)

            // Initialize and score ancestors.
            for((level, ancestor) in ancestors.withIndex()) {
                if(ancestor.tagName().isNullOrBlank()) { // with Jsoup this should never be true as we're only handling Elements
                    continue@elements
                }

                if(getReadabilityObject(ancestor) == null) {
                    candidates.add(ancestor)
                    initializeNode(ancestor, options)
                }

                // Node score divider:
                // - parent:             1 (no division)
                // - grandparent:        2
                // - great grandparent+: ancestor level * 3
                val scoreDivider = when(level) {
                    0 -> 1
                    1 -> 2
                    else -> level * 3
                }

                getReadabilityObject(ancestor)?.let { readability ->
                    readability.contentScore += contentScore / scoreDivider.toDouble()
                }
            }
        }

        return candidates
    }

    /**
     * Creates the readability object for [node], registers it in [readabilityObjects]
     * and gives it its start score: a bonus or malus depending on the node's tag
     * plus the class/id weight from [getClassWeight].
     */
    protected open fun initializeNode(node: Element, options: ArticleGrabberOptions): ReadabilityObject {
        val readability = ReadabilityObject(0.0)
        readabilityObjects.put(node, readability)

        val tagScore = when(node.tagName()) {
            "div" -> 5
            "pre", "td", "blockquote" -> 3
            "address", "ol", "ul", "dl", "dd", "dt", "li", "form" -> -3
            "h1", "h2", "h3", "h4", "h5", "h6", "th" -> -5
            else -> 0
        }
        readability.contentScore += tagScore

        readability.contentScore += getClassWeight(node, options)

        return readability
    }

    /**
     * Computes the weight of an element from its class name and its id.
     * Each of the two, if not blank, loses 25 points when it matches the negative
     * pattern and gains 25 points when it matches the positive pattern.
     * Returns 0 if `options.weightClasses` is switched off.
     */
    protected open fun getClassWeight(e: Element, options: ArticleGrabberOptions): Int {
        if(!options.weightClasses) {
            return 0
        }

        // Weight of a single name (class name or id)
        fun weightOf(name: String): Int {
            if(name.isBlank()) {
                return 0
            }

            var nameWeight = 0
            if(regEx.isNegative(name)) {
                nameWeight -= 25
            }
            if(regEx.isPositive(name)) {
                nameWeight += 25
            }
            return nameWeight
        }

        return weightOf(e.className()) + weightOf(e.id())
    }

    /**
     * Returns the ancestors of [node], nearest first.
     *
     * @param maxDepth maximum number of ancestors to return; with 0 (the default) or a negative
     * value all ancestors up to the root are returned.
     */
    protected open fun getNodeAncestors(node: Element, maxDepth: Int = 0): List<Element> {
        val ancestors = ArrayList<Element>()
        var ancestor: Element? = node.parent()

        while(ancestor != null) {
            ancestors.add(ancestor)
            // The list is never empty here, so a maxDepth of 0 never stops the walk.
            if(ancestors.size == maxDepth) {
                break
            }

            ancestor = ancestor.parent()
        }

        return ancestors
    }



    /*          Third step: Get top candidate           */
    
    /**
     * Picks the element that most likely holds the article.
     *
     * First every candidate's score is scaled by its link density and the best [nbTopCandidates]
     * are ranked. If there is no candidate (or the best one is the body), all child nodes of
     * [page] are moved into a newly created div which becomes the top candidate. Otherwise the
     * best candidate may be replaced by one of its ancestors: by one that also is an ancestor of
     * at least three other closely scored top candidates, by a parent with a higher score, or by
     * its parent as long as it is that parent's only child element.
     *
     * @return the top candidate, and true if it had to be created (body fallback), else false.
     */
    protected open fun getTopCandidate(page: Element, candidates: List<Element>, options: ArticleGrabberOptions): Pair<Element, Boolean> {
        val topCandidates = ArrayList<Element>()

        candidates.forEach { candidate ->
            getReadabilityObject(candidate)?.let { readability ->
                // Scale the final candidates score based on link density. Good content
                // should have a relatively small link density (5% or less) and be mostly
                // unaffected by this operation.
                val candidateScore = readability.contentScore * (1 - this.getLinkDensity(candidate))
                readability.contentScore = candidateScore

                log.debug("Candidate: {} with score {}", candidate, candidateScore)

                // Insert the candidate at its rank, keeping at most nbTopCandidates entries.
                for(t in 0 until nbTopCandidates) {
                    val aTopCandidate = topCandidates.getOrNull(t)
                    val topCandidateReadability = if(aTopCandidate != null) getReadabilityObject(aTopCandidate) else null

                    if(aTopCandidate == null || (topCandidateReadability != null && candidateScore > topCandidateReadability.contentScore)) {
                        topCandidates.add(t, candidate)

                        if(topCandidates.size > nbTopCandidates) {
                            topCandidates.removeAt(nbTopCandidates)
                        }
                        break
                    }
                }
            }
        }

        val bestCandidate = topCandidates.firstOrNull()

        // If we still have no top candidate, just use the body as a last resort.
        // We also have to copy the body node so it is something we can modify.
        if(bestCandidate == null || bestCandidate.tagName() == "body") {
            val container = Element("div")
            // Move everything (not just elements, also text nodes etc.) into the container
            // so we even include text directly in the body. Iterate over a copy, as removing
            // children modifies the child node list.
            ArrayList(page.childNodes()).forEach { child ->
                log.debug("Moving child out: {}", child)
                child.remove()
                container.appendChild(child)
            }

            page.appendChild(container)

            this.initializeNode(container, options)

            return Pair(container, true)
        }

        var topCandidate: Element = bestCandidate

        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite closed with current `topCandidate` node.
        val alternativeCandidateAncestors = ArrayList<List<Element>>()

        getReadabilityObject(bestCandidate)?.let { topCandidateReadability ->
            topCandidates.filter { it != bestCandidate }.forEach { otherTopCandidate ->
                if(((getReadabilityObject(otherTopCandidate)?.contentScore ?: 0.0) / topCandidateReadability.contentScore) >= 0.75) {
                    alternativeCandidateAncestors.add(this.getNodeAncestors(otherTopCandidate))
                }
            }
        }

        val MINIMUM_TOPCANDIDATES = 3
        if(alternativeCandidateAncestors.size >= MINIMUM_TOPCANDIDATES) {
            var ancestor: Element? = topCandidate.parent()

            // Tag names have to be compared by value (!=), not by reference (!==).
            while(ancestor != null && ancestor.tagName() != "body") {
                val sharedAncestor: Element = ancestor
                val listsContainingThisAncestor = alternativeCandidateAncestors.count { it.contains(sharedAncestor) }

                if(listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
                    topCandidate = sharedAncestor
                    break
                }
                ancestor = sharedAncestor.parent()
            }
        }

        if(getReadabilityObject(topCandidate) == null) {
            this.initializeNode(topCandidate, options)
        }

        // Because of our bonus system, parents of candidates might have scores
        // themselves. They get half of the node. There won't be nodes with higher
        // scores than our topCandidate, but if we see the score going *up* in the first
        // few steps up the tree, that's a decent sign that there might be more content
        // lurking in other places that we want to unify in. The sibling stuff
        // below does some of that - but only if we've looked high enough up the DOM
        // tree.
        var parentOfTopCandidate: Element? = topCandidate.parent()
        var lastScore = getReadabilityObject(topCandidate)?.contentScore ?: 0.0
        // The scores shouldn't get too low.
        val scoreThreshold = lastScore / 3.0

        while(parentOfTopCandidate != null && parentOfTopCandidate.tagName() != "body") {
            val parentReadability = getReadabilityObject(parentOfTopCandidate)
            if(parentReadability == null) {
                // Unscored ancestors are skipped.
                parentOfTopCandidate = parentOfTopCandidate.parent()
                continue
            }

            val parentScore = parentReadability.contentScore
            if(parentScore < scoreThreshold) {
                break
            }
            if(parentScore > lastScore) {
                // Alright! We found a better parent to use.
                topCandidate = parentOfTopCandidate
                break
            }

            lastScore = parentScore
            parentOfTopCandidate = parentOfTopCandidate.parent()
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        parentOfTopCandidate = topCandidate.parent()
        while(parentOfTopCandidate != null && parentOfTopCandidate.tagName() != "body" && parentOfTopCandidate.children().size == 1) {
            topCandidate = parentOfTopCandidate
            parentOfTopCandidate = topCandidate.parent()
        }

        if(getReadabilityObject(topCandidate) == null) {
            this.initializeNode(topCandidate, options)
        }

        return Pair(topCandidate, false)
    }

    /**
     * Calculates which share of the element's text is link text: the summed text
     * length of all `a` elements found by `getElementsByTag` divided by the text
     * length of the element. An element without text has a density of 0.
     */
    protected open fun getLinkDensity(element: Element): Double {
        val textLength = getInnerText(element, regEx).length
        if(textLength == 0) {
            return 0.0
        }

        // XXX implement _reduceNodeList?
        val linkLength = element.getElementsByTag("a")
                .map { link -> getInnerText(link, regEx).length }
                .sum()

        return linkLength / textLength.toDouble()
    }



    /*          Fourth step: Create articleContent           */

    /**
     * Creates the article container and moves [topCandidate] plus all of its related siblings
     * into it.
     *
     * A sibling is taken over if its score (plus a bonus when it has the same non-empty class
     * name as the top candidate) reaches the sibling threshold, or if [shouldKeepSibling] accepts
     * it and it is either longer than 80 characters with a link density below 0.25, or shorter
     * than 80 characters, free of links and contains a sentence end. Taken over siblings whose
     * tag is not in [ALTER_TO_DIV_EXCEPTIONS] are renamed to div.
     *
     * @param isPaging if true the container gets the id "readability-content".
     * @return the container; empty if [topCandidate] has no readability object.
     */
    protected open fun createArticleContent(doc: Document, topCandidate: Element, isPaging: Boolean): Element {
        val articleContent = doc.createElement("div")
        if(isPaging) {
            articleContent.attr("id", "readability-content")
        }

        val topCandidateReadability = getReadabilityObject(topCandidate) ?: return articleContent

        val siblingScoreThreshold = Math.max(10.0, topCandidateReadability.contentScore * 0.2)
        // Keep potential top candidate's parent node to try to get text direction of it later.
        val parentOfTopCandidate = topCandidate.parent()
        // A dot followed by a space or the end of the text; compiled once for all siblings.
        val sentenceEndRegex = "\\.( |$)".toRegex()

        // Iterate over a copy of the children, as appending a sibling to articleContent removes it from the parent.
        ArrayList(parentOfTopCandidate.children()).forEach { sibling ->
            var append = false

            val siblingReadability = getReadabilityObject(sibling)
            log.debug("Looking at sibling node: {} with score {}", sibling, siblingReadability?.contentScore ?: 0)
            log.debug("Sibling has score {}", siblingReadability?.contentScore?.toString() ?: "Unknown")

            if(sibling == topCandidate) {
                append = true
            }
            else {
                var contentBonus = 0.0

                // Give a bonus if sibling nodes and top candidates have the exact same classname.
                // Strings have to be compared by value here, not by reference.
                if(sibling.className() == topCandidate.className() && topCandidate.className().isNotEmpty()) {
                    contentBonus += topCandidateReadability.contentScore * 0.2
                }

                if(siblingReadability != null &&
                        ((siblingReadability.contentScore + contentBonus) >= siblingScoreThreshold)) {
                    append = true
                }
                else if(shouldKeepSibling(sibling)) {
                    val linkDensity = this.getLinkDensity(sibling)
                    val nodeContent = this.getInnerText(sibling, regEx)
                    val nodeLength = nodeContent.length

                    if(nodeLength > 80 && linkDensity < 0.25) {
                        append = true
                    }
                    else if(nodeLength < 80 && nodeLength > 0 && linkDensity == 0.0 &&
                            nodeContent.contains(sentenceEndRegex)) {
                        append = true
                    }
                }
            }

            if(append) {
                log.debug("Appending node: {}", sibling)

                if(!ALTER_TO_DIV_EXCEPTIONS.contains(sibling.tagName())) {
                    // We have a node that isn't a common block level element, like a form or td tag.
                    // Turn it into a div so it doesn't get filtered out later by accident.
                    log.debug("Altering sibling: {} to div.", sibling)

                    setNodeTag(sibling, "div")
                }

                articleContent.appendChild(sibling)
            }
        }

        return articleContent
    }

    /**
     * Tells whether an unscored or low scored sibling of the top candidate is allowed to get
     * checked for being appended to the article due to its text. Only P elements are.
     */
    protected open fun shouldKeepSibling(sibling: Element): Boolean = "p" == sibling.tagName()



    /*          Fifth step: Prepare article            */

    /**
     * Prepare the article node for display. Clean out any inline styles,
     * iframes, forms, strip extraneous 

tags, etc. */ protected open fun prepArticle(articleContent: Element, options: ArticleGrabberOptions, metadata: ArticleMetadata) { this.cleanStyles(articleContent) // Check for data tables before we continue, to avoid removing items in // those tables, which will often be isolated even though they're // visually linked to other content-ful elements (text, images, etc.). markDataTables(articleContent) // Clean out junk from the article content this.cleanConditionally(articleContent, "form", options) this.cleanConditionally(articleContent, "fieldset", options) this.clean(articleContent, "object") this.clean(articleContent, "embed") this.clean(articleContent, "h1") this.clean(articleContent, "footer") this.clean(articleContent, "link") // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". val shareRegex = "share".toRegex() articleContent.children().forEach { topCandidate -> cleanMatchedNodes(topCandidate, shareRegex) } // If there is only one h2 and its text content substantially equals article title, // they are probably using it as a header and not a subheader, // so remove it since we already extract the title separately. 
val h2 = articleContent.getElementsByTag("h2") if (h2.size == 1) { metadata.title?.let { articleTitle -> if(articleTitle.length > 0) { val lengthSimilarRate = (h2[0].text().length - articleTitle.length) / articleTitle.length.toFloat() if (Math.abs(lengthSimilarRate) < 0.5) { val titlesMatch = if(lengthSimilarRate > 0) { h2[0].text().contains(articleTitle) } else { articleTitle.contains(h2[0].text()) } if(titlesMatch) { this.clean(articleContent, "h2") } } } } } this.clean(articleContent, "iframe") this.clean(articleContent, "input") this.clean(articleContent, "textarea") this.clean(articleContent, "select") this.clean(articleContent, "button") this.cleanHeaders(articleContent, options) // Do these last as the previous stuff may have removed junk // that will affect these this.cleanConditionally(articleContent, "table", options) this.cleanConditionally(articleContent, "ul", options) this.cleanConditionally(articleContent, "div", options) // Remove extra paragraphs removeNodes(articleContent, "p") { paragraph -> val imgCount = paragraph.getElementsByTag("img").size val embedCount = paragraph.getElementsByTag("embed").size val objectCount = paragraph.getElementsByTag("object").size // At this point, nasty iframes have been removed, only remain embedded video ones. val iframeCount = paragraph.getElementsByTag("iframe").size val totalCount = imgCount + embedCount + objectCount + iframeCount return@removeNodes totalCount == 0 && getInnerText(paragraph, normalizeSpaces = false).length == 0 } articleContent.select("br").forEach { br -> val next = nextElement(br.nextSibling(), regEx) if(next != null && next.tagName() == "p") { br.remove() } } } /** * Remove the style attribute on every e and under. * TODO: Test if getElementsByTagName(*) is faster. 
*/
    protected open fun cleanStyles(e: Element) {
        // <svg> elements and everything below them are left untouched.
        if (e.tagName() == "svg") {
            return
        }

        // The class name must be compared by value (`!=`). Kotlin's `!==` tests
        // reference identity, which is practically always true for a String built at
        // runtime and would make this guard a no-op.
        if (e.className() != "readability-styled") {
            // Remove `style` and deprecated presentational attributes
            PRESENTATIONAL_ATTRIBUTES.forEach { attributeName -> e.removeAttr(attributeName) }

            if (DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(e.tagName())) {
                e.removeAttr("width")
                e.removeAttr("height")
            }
        }

        // Recurse into the children regardless of whether e itself was cleaned.
        e.children().forEach { child -> cleanStyles(child) }
    }

    /**
     * Classifies every `table` found via `root.getElementsByTag("table")` as a data
     * table (`true`) or a layout table (`false`) and stores the verdict with
     * [setReadabilityDataTable].
     *
     * The checks run in this order and the first one that applies decides:
     *  1. `role="presentation"`: layout table
     *  2. `datatable="0"`: layout table
     *  3. non-blank `summary` attribute: data table
     *  4. a first `caption` that has child nodes: data table
     *  5. any descendant listed in `DATA_TABLE_DESCENDANTS`: data table
     *  6. a nested table: layout table
     *  7. at least 10 rows or more than 4 columns: data table
     *  8. otherwise a data table only if rows * columns > 10
     *
     * @param root element whose tables are classified
     */
    protected open fun markDataTables(root: Element) {
        for (table in root.getElementsByTag("table")) {
            val isDataTable = when {
                table.attr("role") == "presentation" -> false

                table.attr("datatable") == "0" -> false

                table.attr("summary").isNotBlank() -> true

                table.getElementsByTag("caption").firstOrNull()?.let { it.childNodeSize() > 0 } == true -> true

                // If the table has a descendant with any of these tags, consider it a data table.
                DATA_TABLE_DESCENDANTS.any { tag -> table.getElementsByTag(tag).size > 0 } -> {
                    log.debug("Data table because found data-y descendant")
                    true
                }

                // Nested tables indicate a layout table. jsoup's getElementsByTag() also
                // returns the element it is called on, so the table itself has to be
                // excluded; a plain `size > 0` test would be true for every table.
                table.getElementsByTag("table").any { nested -> nested !== table } -> false

                else -> {
                    // Now just go by size entirely.
                    val (rows, columns) = getRowAndColumnCount(table)
                    rows >= 10 || columns > 4 || rows * columns > 10
                }
            }

            setReadabilityDataTable(table, isDataTable)
        }
    }

    /**
     * Return an object indicating how many rows and columns this table has.
*/
    protected open fun getRowAndColumnCount(table: Element): Pair<Int, Int> {
        var rows = 0
        var columns = 0

        for (tr in table.getElementsByTag("tr")) {
            // A missing or non-numeric rowspan counts as a single row.
            rows += tr.attr("rowspan").toIntOrNull() ?: 1

            // Now look for column-related info; only `td` cells are counted and a
            // missing or non-numeric colspan counts as a single column.
            var columnsInThisRow = 0
            for (cell in tr.getElementsByTag("td")) {
                columnsInThisRow += cell.attr("colspan").toIntOrNull() ?: 1
            }

            columns = Math.max(columns, columnsInThisRow)
        }

        return Pair(rows, columns)
    }

    /**
     * Removes the `tag` elements below [e] that look like clutter rather than content.
     * Does nothing when `options.cleanConditionally` is `false`.
     *
     * A node is kept when it sits inside a table flagged as a data table. It is removed
     * when its class weight is negative. If its text holds fewer than 10 commas, a set
     * of heuristics (image/paragraph ratio, list items, inputs, text length, link
     * density, embeds) decides; with 10 or more commas the node is kept.
     *
     * @param e element whose matching descendants are examined
     * @param tag tag name of the elements to examine
     * @param options supplies the on/off switch and is passed on to `getClassWeight`
     */
    protected open fun cleanConditionally(e: Element, tag: String, options: ArticleGrabberOptions) {
        if (options.cleanConditionally == false) {
            return
        }

        val isList = tag == "ul" || tag == "ol"

        // TODO: Consider taking into account original contentScore here.
        removeNodes(e, tag) { node ->
            // First check if we're in a data table, in which case don't remove us.
            // maxDepth = -1 lifts the depth limit of hasAncestorTag().
            if (hasAncestorTag(node, "table", maxDepth = -1) { ancestor -> getReadabilityDataTable(ancestor) }) {
                return@removeNodes false
            }

            val weight = getClassWeight(node, options)
            val contentScore = 0 // placeholder, see the TODO above

            log.debug("Cleaning Conditionally {}", node)

            if (weight + contentScore < 0) {
                return@removeNodes true
            }

            // Plenty of commas: treat the node as real text and keep it.
            if (getCharCount(node, ',') >= 10) {
                return@removeNodes false
            }

            // Not very many commas: remove the node if the number of non-paragraph
            // elements exceeds the paragraphs or other ominous signs show up.
            val p = node.getElementsByTag("p").size
            val img = node.getElementsByTag("img").size
            // Because of the -100 offset, `li > p` only holds when the node has over
            // 100 more list items than paragraphs.
            val li = node.getElementsByTag("li").size - 100
            val input = node.getElementsByTag("input").size

            // Only embeds whose src is not recognised as a video count against the node.
            val embedCount = node.getElementsByTag("embed").count { embed -> !regEx.isVideo(embed.attr("src")) }

            val linkDensity = getLinkDensity(node)
            val contentLength = getInnerText(node, regEx).length

            (img > 1 && p / img.toFloat() < 0.5 && !hasAncestorTag(node, "figure")) ||
                    (!isList && li > p) ||
                    (input > Math.floor(p / 3.0)) ||
                    (!isList && contentLength < 25 && img == 0 && !hasAncestorTag(node, "figure")) ||
                    (!isList && weight < 25 && linkDensity > 0.2) ||
                    (weight >= 25 && linkDensity > 0.5) ||
                    ((embedCount == 1 && contentLength < 75) || embedCount > 1)
        }
    }

    /**
     * Check if a given node has one of its ancestor tag name matching the
     * provided one.
     *
     * @param node element whose ancestors are inspected (the node itself is not)
     * @param tagName tag name to look for; lower-cased before comparison
     * @param maxDepth when greater than 0 the walk gives up once the depth counter
     *        exceeds it (the direct parent has depth 0, so the default of 3 inspects up
     *        to four ancestors); 0 or a negative value means no limit
     * @param filterFn optional extra condition a matching ancestor has to satisfy
     * @return `true` if a matching ancestor was found within the depth limit
     */
    protected open fun hasAncestorTag(node: Element, tagName: String, maxDepth: Int = 3, filterFn: ((Element) -> Boolean)? = null): Boolean {
        // Locale.ROOT keeps the lower-casing independent of the default locale
        // (e.g. the Turkish dotless i).
        val wantedTag = tagName.toLowerCase(Locale.ROOT)
        var ancestor: Element? = node.parent()
        var depth = 0

        while (ancestor != null) {
            if (maxDepth > 0 && depth > maxDepth) {
                return false
            }
            if (ancestor.tagName() == wantedTag && (filterFn == null || filterFn(ancestor))) {
                return true
            }

            ancestor = ancestor.parent()
            depth++
        }

        return false
    }

    /**
     * Get the number of times the character [c] appears in the inner text of [node].
     */
    protected open fun getCharCount(node: Element, c: Char = ','): Int {
        return getInnerText(node, regEx).count { it == c }
    }

    /**
     * Clean a node of all elements of type "tag".
     * Elements with a tag listed in `EMBEDDED_NODES` are kept when `regEx.isVideo`
     * recognises a video in their attribute values or in their inner HTML.
     */
    protected open fun clean(e: Element, tag: String) {
        val isEmbed = EMBEDDED_NODES.contains(tag)

        removeNodes(e, tag) { element ->
            if (!isEmbed) {
                return@removeNodes true
            }

            // First check the element's own attributes, then the elements inside it.
            val attributeValues = element.attributes().joinToString("|") { it.value }
            val isVideo = regEx.isVideo(attributeValues) || regEx.isVideo(element.html())

            !isVideo
        }
    }

    /**
     * Clean out elements whose id/class combinations match specific string.
     * Only the nodes below [e] are visited: the walk starts at the node following [e]
     * and stops at the node that comes after [e] and its subtree.
     */
    protected open fun cleanMatchedNodes(e: Element, regex: Regex) {
        val endOfSearchMarkerNode = getNextNode(e, true)
        var next = getNextNode(e)

        while (next != null && next != endOfSearchMarkerNode) {
            next = if (regex.containsMatchIn("${next.className()} ${next.id()}")) {
                removeAndGetNext(next, regex.pattern)
            } else {
                getNextNode(next)
            }
        }
    }

    /**
     * Clean out spurious headers from an Element: every h1 and h2 whose class weight
     * (see `getClassWeight`) is negative is removed.
     */
    protected open fun cleanHeaders(e: Element, options: ArticleGrabberOptions) {
        for (headerTag in listOf("h1", "h2")) {
            removeNodes(e, headerTag) { header -> getClassWeight(header, options) < 0 }
        }
    }


    /*      Util methods        */

    /**
     * Hands [node] to `printAndRemove` together with [reason] and returns the node
     * that follows it in traversal order, skipping the removed subtree. The successor
     * is determined before the removal.
     */
    protected open fun removeAndGetNext(node: Element, reason: String = ""): Element? {
        val nextNode = getNextNode(node, ignoreSelfAndKids = true)
        printAndRemove(node, reason)
        return nextNode
    }

    /**
     * Traverse the DOM from node to node, starting at the node passed in.
     * Pass true for the second parameter to indicate this node itself
     * (and its kids) are going away, and we want the next node over.
     *
     * Calling this in a loop will traverse the DOM depth-first.
     *
     * @return the next element, or `null` when the traversal is finished
     */
    protected open fun getNextNode(node: Element, ignoreSelfAndKids: Boolean = false): Element? {
        // First check for kids if those aren't being ignored
        if (!ignoreSelfAndKids) {
            node.children().firstOrNull()?.let { firstChild -> return firstChild }
        }

        // Then look for a next sibling of the node itself and, failing that, of each
        // ancestor in turn. The ancestors themselves were already visited because the
        // traversal is depth-first.
        var current: Element? = node
        while (current != null) {
            current.nextElementSibling()?.let { sibling -> return sibling }
            current = current.parent()
        }

        return null
    }

    /**
     * Looks for a non-blank `dir` attribute and stores the first one found in
     * `articleDir`. The elements are checked in this order: the parent of
     * [topCandidate], [topCandidate] itself, the elements returned by
     * `getNodeAncestors` for the parent, the document body and the `html` element.
     * Null entries (e.g. a document without body or html element) are skipped.
     */
    protected open fun getTextDirection(topCandidate: Element, doc: Document) {
        // Insertion-ordered set: keeps the lookup order and drops duplicates.
        val candidates = LinkedHashSet<Element?>()
        candidates.add(topCandidate.parent())
        candidates.add(topCandidate)
        candidates.addAll(getNodeAncestors(topCandidate.parent()))
        candidates.add(doc.body())
        candidates.add(doc.selectFirst("html")) // needed as dir is often set on html tag

        for (candidate in candidates) {
            val dir = candidate?.attr("dir") ?: continue
            if (dir.isNotBlank()) {
                this.articleDir = dir
                return
            }
        }
    }


    /** Returns the entry stored for [element] in `readabilityObjects`, or `null` if there is none. */
    protected open fun getReadabilityObject(element: Element): ReadabilityObject? {
        return readabilityObjects[element]
    }

    /** Returns the data-table flag stored for [table]; tables without an entry count as `false`. */
    protected open fun getReadabilityDataTable(table: Element): Boolean {
        return this.readabilityDataTable[table] ?: false
    }

    /** Stores the data-table flag for [table], replacing any earlier entry. */
    protected open fun setReadabilityDataTable(table: Element, readabilityDataTable: Boolean) {
        this.readabilityDataTable[table] = readabilityDataTable
    }

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy