net.dankito.readability4j.processor.MetadataParser.kt Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of Readability4J Show documentation
A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.
The newest version!
package net.dankito.readability4j.processor

import net.dankito.readability4j.model.ArticleMetadata
import net.dankito.readability4j.util.RegExUtil
import org.jsoup.nodes.Document
import java.util.Locale
import java.util.regex.Pattern


/**
 * Extracts article metadata (byline, excerpt and title) from a parsed HTML [Document].
 *
 * Byline and excerpt are read from `<meta>` elements; the title is derived from the
 * document title and cleaned up heuristically, with the Open Graph / Twitter Cards
 * title used as fallback when no title could be determined.
 *
 * @property regEx helper that is handed to `getInnerText` when text is read from elements.
 */
open class MetadataParser(protected val regEx: RegExUtil = RegExUtil()): ProcessorBase() {


    /**
     * Collects byline, excerpt and title for [document].
     *
     * - byline: `content` of a `<meta>` whose `name` or `property` equals `author`
     *   (if several exist, the last one wins).
     * - excerpt: first available of `description`, `og:description`, `twitter:description`;
     *   stays `null` when none of them is present.
     * - title: result of [getArticleTitle]; if that is blank, `og:title`, then
     *   `twitter:title`, then the empty string.
     *
     * @param document the parsed HTML document to inspect.
     * @return a freshly created [ArticleMetadata] filled with the values found.
     */
    open fun getArticleMetadata(document: Document): ArticleMetadata {
        val metadata = ArticleMetadata()
        // Normalized meta name / property (lowercase, no whitespace) -> trimmed content.
        val values = HashMap<String, String>()

        document.select("meta").forEach { element ->
            val elementName = element.attr("name")
            val elementProperty = element.attr("property")

            if(elementName == "author" || elementProperty == "author") {
                metadata.byline = element.attr("content")
                return@forEach
            }

            // The name attribute takes precedence over the property attribute.
            val matchedName = when {
                NAME_PATTERN.matcher(elementName).find() -> elementName
                PROPERTY_PATTERN.matcher(elementProperty).find() -> elementProperty
                else -> return@forEach
            }

            val content = element.attr("content")
            if(!content.isNullOrBlank()) {
                // Convert to lowercase and remove any whitespace so we can match below.
                // Locale.ROOT keeps the result independent of the default locale
                // (e.g. a Turkish locale would lowercase "TITLE" to "tıtle").
                val key = matchedName.toLowerCase(Locale.ROOT).replace(WHITESPACE_CHAR, "")
                // Trim and replace each pair of consecutive spaces with a single space.
                values[key] = content.trim().replace("  ", " ")
            }
        }

        metadata.excerpt = values["description"]
            ?: values["og:description"] // Use facebook open graph description.
            ?: values["twitter:description"] // Use twitter cards description.

        metadata.title = getArticleTitle(document)
        if(metadata.title.isNullOrBlank()) {
            metadata.title = values["og:title"] // Use facebook open graph title.
                ?: values["twitter:title"] // Use twitter cards title.
                ?: ""
        }

        return metadata
    }

    /**
     * Determines the article title of [doc].
     *
     * Starts from the document title (or, if that is blank, the text of the element with
     * id `title`) and then applies exactly one of these clean-ups:
     *
     * 1. Title contains a separator surrounded by spaces (`|`, `-`, `\`, `/`, `>`, `»`):
     *    drop the part after the last separator; if fewer than 3 words remain, drop the
     *    part before the first separator instead.
     * 2. Title contains `": "` and no `h1`/`h2` has exactly that text: take the part after
     *    the last colon, or after the first colon if that is too short; keep the original
     *    title if more than 5 words precede the first colon.
     * 3. Title is longer than 150 or shorter than 15 characters and the document has
     *    exactly one `h1`: use that heading's text.
     *
     * Finally, a result of 4 words or fewer is discarded in favour of the original title
     * unless a hierarchical separator was present and exactly one word was removed.
     *
     * @param doc the parsed HTML document.
     * @return the cleaned-up title; may be empty when no title could be read.
     */
    protected open fun getArticleTitle(doc: Document): String {
        var curTitle = ""
        var origTitle = ""

        try {
            origTitle = doc.title()
            curTitle = origTitle

            // If they had an element with id "title" in their HTML
            if(curTitle.isBlank()) {
                doc.select("#title").first()?.let { elementWithIdTitle ->
                    origTitle = getInnerText(elementWithIdTitle, regEx)
                    curTitle = origTitle
                }
            }
        } catch(e: Exception) {/* ignore exceptions setting the title. */}

        var titleHadHierarchicalSeparators = false

        when {
            // If there's a separator in the title, first remove the final part
            curTitle.contains(TITLE_SEPARATOR) -> {
                titleHadHierarchicalSeparators = curTitle.contains(HIERARCHICAL_SEPARATOR)
                curTitle = origTitle.replace(TITLE_WITHOUT_FINAL_PART, "$1")

                // If the resulting title is too short (fewer than 3 words), remove
                // the first part instead:
                if(wordCount(curTitle) < 3) {
                    curTitle = origTitle.replace(TITLE_WITHOUT_FIRST_PART, "$1")
                }
            }

            curTitle.contains(": ") -> {
                // Check if we have an heading containing this exact string, so we
                // could assume it's the full title.
                val headingMatchesTitle = doc.select("h1, h2").any { it.wholeText() == curTitle }

                // If we don't, let's extract the title out of the original title string.
                if(!headingMatchesTitle) {
                    curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1)

                    // If the title is now too short, try the first colon instead:
                    if(wordCount(curTitle) < 3) {
                        curTitle = origTitle.substring(origTitle.indexOf(':') + 1)
                    }
                    // But if we have too many words before the colon there's something weird
                    // with the titles and the H tags so let's just use the original title instead
                    else if(wordCount(origTitle.substring(0, origTitle.indexOf(':'))) > 5) {
                        curTitle = origTitle
                    }
                }
            }

            curTitle.length > 150 || curTitle.length < 15 -> {
                val hOnes = doc.getElementsByTag("h1")

                if(hOnes.size == 1) {
                    curTitle = getInnerText(hOnes[0], regEx)
                }
            }
        }

        curTitle = curTitle.trim()
        // If we now have 4 words or fewer as our title, and either no
        // 'hierarchical' separators (\, /, > or ») were found in the original
        // title or we decreased the number of words by more than 1 word, use
        // the original title.
        val curTitleWordCount = wordCount(curTitle)
        val origWordCountWithoutSeparators = wordCount(origTitle.replace(SEPARATOR_RUN, ""))
        if(curTitleWordCount <= 4 &&
            (!titleHadHierarchicalSeparators || curTitleWordCount != origWordCountWithoutSeparators - 1)) {
            curTitle = origTitle
        }

        return curTitle
    }

    /**
     * Counts the parts obtained by splitting [str] on runs of whitespace.
     *
     * Note that an empty string yields 1, and leading or trailing whitespace adds an
     * empty part to the count.
     */
    protected open fun wordCount(str: String): Int = str.split(WHITESPACE_RUN).size


    private companion object {

        // Match "description", or Twitter's "twitter:description" (Cards)
        // in name attribute.
        val NAME_PATTERN: Pattern =
            Pattern.compile("^\\s*((twitter)\\s*:\\s*)?(description|title)\\s*$", Pattern.CASE_INSENSITIVE)

        // Match Facebook's Open Graph title & description properties.
        val PROPERTY_PATTERN: Pattern =
            Pattern.compile("^\\s*og\\s*:\\s*(description|title)\\s*$", Pattern.CASE_INSENSITIVE)

        val WHITESPACE_CHAR = Regex("\\s")
        val WHITESPACE_RUN = Regex("\\s+")

        // Bodies of regex character classes. "\\\\" is a literal backslash, so the
        // classes match: | - \ / > »   and   \ / > »   respectively.
        const val SEPARATOR_CHARS = "\\|\\-\\\\/>»"
        const val HIERARCHICAL_SEPARATOR_CHARS = "\\\\/>»"

        val TITLE_SEPARATOR = Regex(" [$SEPARATOR_CHARS] ")
        val HIERARCHICAL_SEPARATOR = Regex(" [$HIERARCHICAL_SEPARATOR_CHARS] ")
        val TITLE_WITHOUT_FINAL_PART = Regex("(.*)[$SEPARATOR_CHARS] .*", RegexOption.IGNORE_CASE)
        val TITLE_WITHOUT_FIRST_PART =
            Regex("[^$SEPARATOR_CHARS]*[$SEPARATOR_CHARS](.*)", RegexOption.IGNORE_CASE)
        val SEPARATOR_RUN = Regex("[$SEPARATOR_CHARS]+")
    }

}