net.dankito.readability4j.Article.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of readability4j Show documentation
Show all versions of readability4j Show documentation
A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.
package net.dankito.readability4j
import org.jsoup.nodes.Element
/**
 * Holds an extracted article: the processed content element plus the metadata
 * (title, excerpt, byline, direction, charset) stored alongside it.
 */
open class Article(
    /**
     * Original uri object that was passed to constructor
     */
    val uri: String
) {

    /**
     * Article title
     */
    var title: String? = null

    /**
     * Element holding the processed article content; [content] and [textContent] are derived from it.
     */
    var articleContent: Element? = null

    /**
     * HTML string of processed article content in a <div> element.
     *
     * No encoding is applied to it, see [contentWithUtf8Encoding] or issue
     * [https://github.com/dankito/Readability4J/issues/1].
     */
    val content: String?
        get() = articleContent?.html() // TODO: but this removes paging information (pages in top node )

    /**
     * [content] returns a <div> element.
     *
     * As the only way in HTML to set an encoding is via <head><meta charset=""> tag, no explicit
     * encoding is applied to it.
     * As a result non-ASCII characters may get displayed incorrectly.
     *
     * So this property wraps [content] in <html><head><meta charset="utf-8"/></head><body><!--
     * content--></body></html> so that UTF-8 encoding gets applied.
     *
     * See [https://github.com/dankito/Readability4J/issues/1] for more info.
     */
    val contentWithUtf8Encoding: String?
        get() = getContentWithEncoding("utf-8")

    /**
     * Returns the content wrapped in an HTML document whose charset is set to the article's [charset],
     * or to UTF-8 if [charset] is not set.
     * See [contentWithUtf8Encoding] for more details.
     */
    val contentWithArticlesEncodingOrUtf8: String?
        get() = getContentWithEncoding(charset ?: "utf-8")

    /**
     * Text of [articleContent], or `null` if no article content is set.
     */
    val textContent: String?
        get() = articleContent?.text()

    /**
     * Length of article, in characters; -1 if there is no text content.
     *
     * The getter always derives the value from [textContent] and never reads the backing field,
     * so a value assigned to this property is not observable. It stays a `var` only so that
     * existing callers that assign it keep compiling.
     */
    var length: Int = -1
        get() = textContent?.length ?: -1

    /**
     * Article description, or short excerpt from content
     */
    var excerpt: String? = null

    /**
     * Author metadata
     */
    var byline: String? = null

    /**
     * Content direction
     */
    var dir: String? = null

    /**
     * Article's charset
     */
    var charset: String? = null

    /**
     * [content] returns a <div> element.
     *
     * As the only way in HTML to set an encoding is via <head><meta charset=""> tag, no explicit
     * encoding is applied to it.
     * As a result non-ASCII characters may get displayed incorrectly.
     *
     * So this method wraps [content] in <html><head><meta charset="[encoding]"/></head><body><!--
     * content--></body></html> so that encoding gets applied.
     *
     * See [https://github.com/dankito/Readability4J/issues/1] for more info.
     *
     * @param encoding charset name written into the `<meta charset>` tag, e.g. "utf-8"
     * @return the wrapped HTML document, or `null` if [content] is `null`
     */
    fun getContentWithEncoding(encoding: String): String? =
        content?.let { html ->
            // The wrapper must actually carry the <meta charset> tag; without it the
            // encoding argument has no effect on the returned document.
            "<html>\n  <head>\n    <meta charset=\"$encoding\"/>\n  </head>\n  <body>\n    " +
                "$html\n  </body>\n</html>"
        }

}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy