net.dankito.readability4j.extended.util.RegExUtilExtended.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of readability4j Show documentation
Show all versions of readability4j Show documentation
A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.
package net.dankito.readability4j.extended.util
import net.dankito.readability4j.util.RegExUtil
import java.util.regex.Pattern
/**
 * Extension of [RegExUtil] that adds an image filter ([keepImage]) and, by default,
 * widens the negative pattern with the extra alternative `float`.
 *
 * All patterns except `removeImagePattern` are handed to the [RegExUtil] constructor unchanged.
 */
open class RegExUtilExtended : RegExUtil {

    companion object {
        /** Default alternatives that cause [keepImage] to reject an image. */
        const val RemoveImageDefaultPattern = "author|avatar|thumbnail" // CHANGE: this is not in Mozilla's Readability

        /**
         * Suffix appended to `NegativeDefaultPattern` when no custom negative pattern is supplied.
         * It starts with `|`, so it only makes sense as a continuation of an existing alternation.
         */
        const val NegativeDefaultPatternExtended = "|float"
    }

    /** Compiled form of the `removeImagePattern` constructor argument; consulted by [keepImage]. */
    protected val removeImage: Pattern

    /**
     * Creates the utility with the given pattern sources.
     *
     * Note that [NegativeDefaultPatternExtended] is only part of the *default* value of
     * [negativePattern]; a caller that passes its own negative pattern does not get `float` added.
     *
     * @param removeImagePattern regular expression; an image whose match string contains a match
     * for it (ignoring case) is rejected by [keepImage].
     * @throws java.util.regex.PatternSyntaxException if [removeImagePattern] is not a valid regular expression.
     */
    constructor(
        unlikelyCandidatesPattern: String = UnlikelyCandidatesDefaultPattern,
        okMaybeItsACandidatePattern: String = OkMaybeItsACandidateDefaultPattern,
        positivePattern: String = PositiveDefaultPattern,
        negativePattern: String = NegativeDefaultPattern + NegativeDefaultPatternExtended,
        extraneousPattern: String = ExtraneousDefaultPattern,
        bylinePattern: String = BylineDefaultPattern,
        replaceFontsPattern: String = ReplaceFontsDefaultPattern,
        normalizePattern: String = NormalizeDefaultPattern,
        videosPattern: String = VideosDefaultPattern,
        nextLinkPattern: String = NextLinkDefaultPattern,
        prevLinkPattern: String = PrevLinkDefaultPattern,
        whitespacePattern: String = WhitespaceDefaultPattern,
        hasContentPattern: String = HasContentDefaultPattern,
        removeImagePattern: String = RemoveImageDefaultPattern
    ) : super(
        unlikelyCandidatesPattern, okMaybeItsACandidatePattern, positivePattern, negativePattern,
        extraneousPattern, bylinePattern, replaceFontsPattern, normalizePattern,
        videosPattern, nextLinkPattern, prevLinkPattern, whitespacePattern, hasContentPattern
    ) {
        // Match regardless of case so that values such as "userAvatar" or "Thumbnail" are caught by
        // the lower-case default tokens.
        // NOTE(review): presumably this mirrors how RegExUtil compiles its keyword patterns - confirm.
        this.removeImage = Pattern.compile(removeImagePattern, Pattern.CASE_INSENSITIVE)
    }

    /**
     * Decides whether an image described by [matchString] should be kept.
     *
     * @param matchString text to test against the patterns (presumably built from the image's
     * attributes such as id/class - verify against the caller).
     * @return `false` if [matchString] is negative without also being positive (as judged by the
     * inherited `isNegative` / `isPositive`), or if it contains a match for the remove-image
     * pattern; `true` otherwise.
     */
    open fun keepImage(matchString: String): Boolean { // CHANGE: this is not in Mozilla's Readability
        val negativeOnly = isNegative(matchString) && !isPositive(matchString)

        // `find()` looks for the pattern anywhere inside matchString, not for a full match.
        return !negativeOnly && !removeImage.matcher(matchString).find()
    }
}