All downloads are free. Search and download functionality uses the official Maven repository.

net.dankito.readability4j.extended.util.RegExUtilExtended.kt Maven / Gradle / Ivy

Go to download

A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.

The newest version!
package net.dankito.readability4j.extended.util

import net.dankito.readability4j.util.RegExUtil
import java.util.regex.Pattern


/**
 * Variant of [RegExUtil] that carries one additional compiled pattern, [removeImage], and by
 * default appends [NegativeDefaultPatternExtended] to the negative pattern handed to the base class.
 *
 * CHANGE: the image handling ([removeImage] / [keepImage]) is not in Mozilla's Readability.
 */
open class RegExUtilExtended : RegExUtil {

    /** Compiled from the `removeImagePattern` constructor argument; consulted by [keepImage]. */
    protected val removeImage: Pattern


    /**
     * Passes every pattern except [removeImagePattern] on to the [RegExUtil] constructor, in the
     * order declared here, and compiles [removeImagePattern] into [removeImage].
     *
     * All parameters are optional. The default for [negativePattern] is `NegativeDefaultPattern`
     * with [NegativeDefaultPatternExtended] appended; the default for [removeImagePattern] is
     * [RemoveImageDefaultPattern]. The remaining defaults are the `*DefaultPattern` constants
     * referenced below, which are not declared in this class.
     */
    constructor(
        unlikelyCandidatesPattern: String = UnlikelyCandidatesDefaultPattern,
        okMaybeItsACandidatePattern: String = OkMaybeItsACandidateDefaultPattern,
        positivePattern: String = PositiveDefaultPattern,
        negativePattern: String = NegativeDefaultPattern + NegativeDefaultPatternExtended,
        extraneousPattern: String = ExtraneousDefaultPattern,
        bylinePattern: String = BylineDefaultPattern,
        replaceFontsPattern: String = ReplaceFontsDefaultPattern,
        normalizePattern: String = NormalizeDefaultPattern,
        videosPattern: String = VideosDefaultPattern,
        nextLinkPattern: String = NextLinkDefaultPattern,
        prevLinkPattern: String = PrevLinkDefaultPattern,
        whitespacePattern: String = WhitespaceDefaultPattern,
        hasContentPattern: String = HasContentDefaultPattern,
        removeImagePattern: String = RemoveImageDefaultPattern
    ) : super(
        unlikelyCandidatesPattern,
        okMaybeItsACandidatePattern,
        positivePattern,
        negativePattern,
        extraneousPattern,
        bylinePattern,
        replaceFontsPattern,
        normalizePattern,
        videosPattern,
        nextLinkPattern,
        prevLinkPattern,
        whitespacePattern,
        hasContentPattern
    ) {
        removeImage = Pattern.compile(removeImagePattern)
    }


    /**
     * Decides whether an image described by [matchString] should be kept.
     *
     * CHANGE: this is not in Mozilla's Readability.
     *
     * The image is rejected when either
     *  - `isNegative(matchString)` is true while `isPositive(matchString)` is false, or
     *  - [removeImage] finds a match anywhere in [matchString].
     *
     * `isNegative` / `isPositive` are not defined in this class (presumably inherited from
     * [RegExUtil] - confirm there for their exact matching rules).
     *
     * @param matchString the text the patterns are tested against
     * @return `false` if one of the rejection conditions above holds, `true` otherwise
     */
    open fun keepImage(matchString: String): Boolean {
        // Short-circuit order is deliberate: isNegative, then isPositive, then removeImage.
        val negativeWithoutPositive = isNegative(matchString) && !isPositive(matchString)

        return !(negativeWithoutPositive || removeImage.matcher(matchString).find())
    }


    companion object {
        /** Default source for [removeImage]. CHANGE: this is not in Mozilla's Readability. */
        const val RemoveImageDefaultPattern = "author|avatar|thumbnail"

        /**
         * Fragment appended to `NegativeDefaultPattern` to form the constructor's default
         * `negativePattern`; it starts with `|` so that it extends that pattern's alternation.
         */
        const val NegativeDefaultPatternExtended = "|float"
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy