All downloads are free. Search and download functionality uses the official Maven repository.

net.dankito.readability4j.Article.kt Maven / Gradle / Ivy

Go to download

A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.

There is a newer version: 1.0.8
Show newest version
package net.dankito.readability4j

import org.jsoup.nodes.Element


/**
 * Result holder for an extracted article: metadata (title, byline, excerpt, ...) plus the
 * article's content element and several string views derived from it.
 */
open class Article(

    /**
     * Original uri that was passed to the constructor.
     */
    val uri: String

) {

    /**
     * Article title.
     */
    var title: String? = null

    /**
     * Element from which [content] and [textContent] are derived; `null` until it has been set.
     */
    var articleContent: Element? = null

    /**
     * HTML string of the processed article content in a `<div>` element.
     *
     * No encoding is applied to it, see [contentWithUtf8Encoding] or issue
     * https://github.com/dankito/Readability4J/issues/1.
     *
     * `null` when [articleContent] is not set.
     */
    val content: String?
        // TODO: but this removes paging information (pages in top node)
        get() = articleContent?.html()

    /**
     * [content] returns a `<div>` element.
     *
     * As the only way in HTML to set an encoding is via the `<head><meta charset="">` tag,
     * no explicit encoding is applied to it.
     * As a result non-ASCII characters may get displayed incorrectly.
     *
     * So this property wraps [content] in
     * `<html><head><meta charset="utf-8"/></head><body>content</body></html>`
     * so that UTF-8 encoding gets applied.
     *
     * See https://github.com/dankito/Readability4J/issues/1 for more info.
     */
    val contentWithUtf8Encoding: String?
        get() = getContentWithEncoding("utf-8")

    /**
     * Returns the content wrapped in an `<html>` element with the charset set to the article's
     * [charset], or to UTF-8 if [charset] is not set.
     *
     * See [contentWithUtf8Encoding] for more details.
     */
    val contentWithArticlesEncodingOrUtf8: String?
        get() = getContentWithEncoding(charset ?: "utf-8")

    /**
     * Text of [articleContent] as returned by its `text()` call; `null` when [articleContent] is not set.
     */
    val textContent: String?
        get() = articleContent?.text()

    /**
     * Length of the article, in characters.
     *
     * Always computed from [textContent]; `-1` when there is no text content. The getter never
     * reads the backing field, so a value assigned to this property is not observable. It stays
     * a `var` only to remain source compatible with existing callers.
     */
    var length: Int = -1
        get() = textContent?.length ?: -1

    /**
     * Article description, or short excerpt from content.
     */
    var excerpt: String? = null

    /**
     * Author metadata.
     */
    var byline: String? = null

    /**
     * Content direction.
     */
    var dir: String? = null

    /**
     * Article's charset.
     */
    var charset: String? = null

    /**
     * [content] returns a `<div>` element.
     *
     * As the only way in HTML to set an encoding is via the `<head><meta charset="">` tag,
     * no explicit encoding is applied to it.
     * As a result non-ASCII characters may get displayed incorrectly.
     *
     * So this method wraps [content] in
     * `<html><head><meta charset="[encoding]"/></head><body>content</body></html>`
     * so that the encoding gets applied.
     *
     * See https://github.com/dankito/Readability4J/issues/1 for more info.
     *
     * @param encoding charset name written into the `<meta charset>` attribute; it is inserted
     * verbatim, without any escaping.
     * @return the wrapped HTML document, or `null` if [content] is `null`.
     */
    fun getContentWithEncoding(encoding: String): String? =
        content?.let { html ->
            // The wrapper markup must be emitted literally; without it no charset is declared at all.
            "<html>\n  <head>\n    <meta charset=\"$encoding\"/>\n  </head>\n  <body>\n    " +
                "$html\n  </body>\n</html>"
        }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy