// net.dankito.readability4j.processor.Preprocessor.kt (part of the Readability4J library)
package net.dankito.readability4j.processor
import net.dankito.readability4j.util.RegExUtil
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.slf4j.LoggerFactory
/**
 * Performs basic sanitization before starting the extraction process.
 *
 * All operations modify the passed [Document] in place. The helpers `removeNodes`,
 * `printAndRemove`, `replaceNodes` and `nextElement` used below are not defined in this
 * class (presumably inherited from [ProcessorBase] - confirm there for their exact contract).
 *
 * @param regEx regular expression helper handed to `nextElement` when skipping
 * whitespace between `<br>` elements.
 */
open class Preprocessor(protected val regEx: RegExUtil = RegExUtil()) : ProcessorBase() {

    companion object {
        private val log = LoggerFactory.getLogger(Preprocessor::class.java)
    }


    /**
     * Prepare the HTML document for readability to scrape it.
     * This includes things like stripping javascript, CSS, and handling terrible markup.
     *
     * Steps, in order: remove scripts (and handle `<noscript>`), remove styles, remove forms,
     * remove comments, collapse `<br>` chains into `<p>` elements, rename `<font>` to `<span>`.
     *
     * @param document the document to sanitize; it is modified in place.
     */
    open fun prepareDocument(document: Document) {
        log.debug("Starting to prepare document")

        removeScripts(document)
        removeStyles(document)
        removeForms(document) // TODO: this is not in Mozilla's Readability
        removeComments(document) // TODO: this is not in Mozilla's Readability
        replaceBrs(document, regEx)
        replaceNodes(document, "font", "span")
    }


    /**
     * Removes all `<script>` elements and handles every `<noscript>` element: a `<noscript>`
     * is unwrapped (its children stay in the document) if [shouldKeepImageInNoscriptElement]
     * says so, otherwise it is removed including its content.
     *
     * @param document the document to strip scripts from.
     */
    protected open fun removeScripts(document: Document) {
        removeNodes(document, "script") { scriptNode ->
            scriptNode.`val`(null) // TODO: what is this good for?
            scriptNode.removeAttr("src")
            true // the filter accepts every script element
        }

        document.getElementsByTag("noscript").forEach { noscript ->
            if(shouldKeepImageInNoscriptElement(document, noscript)) { // TODO: this is not in Mozilla's Readability
                noscript.unwrap()
            }
            else {
                printAndRemove(noscript, "removeScripts('noscript')")
            }
        }
    }

    /**
     * Decides whether the content of a `<noscript>` element should be kept because it holds
     * an image that is not available anywhere else in the document.
     *
     * An image counts as "already in the document" only if an `<img>` *outside* of [noscript]
     * has the same `src`. The images inside [noscript] are part of [document] as well, so they
     * must not be compared against themselves - otherwise every image would be found and the
     * method could never return `true`.
     *
     * The `src` values are compared directly instead of being spliced into a CSS selector, so
     * blank values or values containing selector syntax (`]`, quotes, ...) cannot break the
     * query. The comparison ignores case and surrounding whitespace, like the `[attr=value]`
     * selector this replaces.
     *
     * @param document the document that contains [noscript].
     * @param noscript the `<noscript>` element to check.
     * @return `true` if [noscript] contains at least one `<img>` whose `src` is not used by any
     * `<img>` outside of [noscript]; `false` otherwise, including when it contains no image.
     */
    protected open fun shouldKeepImageInNoscriptElement(document: Document, noscript: Element): Boolean {
        val noscriptImages = noscript.select("img")
        if(noscriptImages.isEmpty()) {
            return false
        }

        // Identity comparison (===) on purpose: we want to exclude exactly the nodes inside
        // this <noscript>, not nodes that merely look the same.
        val sourcesOutsideNoscript = document.select("img")
                .filter { candidate -> noscriptImages.none { it === candidate } }
                .map { it.attr("src").trim() }

        return noscriptImages.any { image ->
            val source = image.attr("src").trim()
            sourcesOutsideNoscript.none { it.equals(source, ignoreCase = true) }
        }
    }


    /**
     * Removes all `<style>` elements from the document.
     */
    protected open fun removeStyles(document: Document) {
        removeNodes(document, "style")
    }

    /**
     * Removes all `<form>` elements from the document.
     */
    protected open fun removeForms(document: Document) {
        removeNodes(document, "form")
    }

    /**
     * Recursively removes all comment nodes below [node].
     *
     * @param node the node whose descendants are searched for comments.
     */
    protected open fun removeComments(node: Node) {
        var i = 0

        while (i < node.childNodeSize()) {
            val child = node.childNode(i)

            if(child.nodeName() == "#comment") {
                // Do not advance i: once the comment is detached, the following sibling
                // is expected to move to index i (assumes printAndRemove detaches the node).
                printAndRemove(child, "removeComments")
            }
            else {
                removeComments(child)
                i++
            }
        }
    }


    /**
     * Replaces 2 or more successive `<br>` elements with a single `<p>`.
     * Whitespace between `<br>` elements is ignored. For example:
     *
     *     <div>foo<br>bar<br> <br><br>abc</div>
     *
     * will become:
     *
     *     <div>foo<br>bar<p>abc</p></div>
     *
     * @param document the document whose body is processed; it is modified in place.
     * @param regEx helper passed on to `nextElement` for skipping whitespace.
     */
    protected open fun replaceBrs(document: Document, regEx: RegExUtil) {
        document.body().select("br").forEach { br ->
            var next: Node? = br.nextSibling()

            // Whether 2 or more <br> elements have been found and replaced with a
            // <p> block.
            var replaced = false

            // If we find a <br> chain, remove the <br>s until we hit another element
            // or non-whitespace. This leaves behind the first <br> in the chain
            // (which will be replaced with a <p> later).
            next = nextElement(next, regEx)
            while(next != null && next.nodeName() == "br") {
                replaced = true
                // Remember the sibling before detaching the <br>.
                val brSibling = next.nextSibling()
                printAndRemove(next, "replaceBrs")
                next = nextElement(brSibling, regEx)
            }

            // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
            // all sibling nodes as children of the <p> until we hit another <br>
            // chain.
            if(replaced) {
                // All <br>s were selected from this document, so it is their owner document.
                val p = document.createElement("p")
                br.replaceWith(p)

                next = p.nextSibling()
                while(next != null) {
                    // If we've hit another <br><br>, we're done adding children to this <p>.
                    if(next.nodeName() == "br") {
                        // Look at what FOLLOWS this <br>: starting the search at the <br>
                        // itself would find that very <br> and stop on single <br>s, too.
                        val nextElem = nextElement(next.nextSibling(), regEx)
                        if(nextElem != null && nextElem.tagName() == "br") {
                            break
                        }
                    }

                    // Otherwise, make this node a child of the new <p>.
                    val sibling = next.nextSibling()
                    p.appendChild(next)
                    next = sibling
                }
            }
        }
    }

}