/*
 * All Downloads are FREE. Search and download functionalities are using the official Maven repository.
 *
 * ai.platon.pulsar.common.LinkExtractors.kt Maven / Gradle / Ivy
 *
 * There is a newer version: 2.1.0
 * Show newest version
 */
package ai.platon.pulsar.common

import java.io.IOException
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.util.regex.Pattern

/**
 * Extracts URL-like substrings from text using the regular expression [URL_PATTERN].
 *
 * TODO: org.nibor.autolink.LinkExtractor might be faster
 * */
open class UrlExtractor {
    companion object {

        /**
         * Case-insensitive pattern for text that begins with `http://`, `https://`, `ftp://`,
         * `ftps://` or `www.`, located at the start of a line or right after a non-word character.
         *
         * Capture group 1 is the scheme/`www.` prefix; callers use its start offset so that the
         * non-word character preceding the URL is not part of the extracted text.
         *
         * TODO: see https://github.com/aosp-mirror/platform_frameworks_base/blob/master/core/java/android/util/Patterns.java
         * */
        val URL_PATTERN: Pattern = Pattern.compile(
            "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)"
                    + "(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*"
                    + "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",
            Pattern.CASE_INSENSITIVE or Pattern.MULTILINE or Pattern.DOTALL)
    }

    /**
     * Returns the first URL found in [line].
     *
     * @param line the text to search
     * @return the first match of [URL_PATTERN], starting at its scheme/`www.` prefix,
     *         or `null` when [line] contains no match
     */
    fun extract(line: String): String? {
        val matcher = URL_PATTERN.matcher(line)
        // Only the first match is wanted, so a single find() is sufficient.
        return if (matcher.find()) line.substring(matcher.start(1), matcher.end()) else null
    }

    /**
     * Adds every URL found in [line] to [urls].
     *
     * @param line the text to search
     * @param urls the set that receives each match; existing elements are left untouched
     */
    fun extractTo(line: String, urls: MutableSet<String>) {
        val matcher = URL_PATTERN.matcher(line)
        while (matcher.find()) {
            // start(1) skips the optional non-word character that precedes the URL.
            urls.add(line.substring(matcher.start(1), matcher.end()))
        }
    }
}

/**
 * Collects the URLs contained in a named resource.
 *
 * @property resource the resource name handed to `ResourceLoader.readAllLines`
 * @property filter predicate handed to `ResourceLoader.readAllLines` together with [resource];
 *           presumably only lines accepted by it are returned — verify against `ResourceLoader`
 */
internal class ResourceExtractor(
    val resource: String,
    val filter: (String) -> Boolean = { true }
): UrlExtractor() {
    /**
     * Reads the lines of [resource] and extracts every URL from each of them.
     *
     * @return the distinct URLs found across all lines read
     */
    fun extract(): Set<String> {
        val urls = mutableSetOf<String>()
        ResourceLoader.readAllLines(resource, filter).forEach { line -> extractTo(line, urls) }
        return urls
    }
}

/**
 * Collects the URLs contained in a single file.
 *
 * @property path the file to read
 * @property filter predicate selecting which lines are scanned for URLs; accepts every line by default
 */
internal class FileExtractor(
    val path: Path,
    val filter: (String) -> Boolean = { true }
): UrlExtractor() {
    /**
     * Reads all lines of [path], keeps those accepted by [filter] and extracts every URL from them.
     *
     * @return the distinct URLs found, or an empty set when [path] does not exist
     */
    fun extract(): Set<String> {
        if (!Files.exists(path)) {
            return emptySet()
        }

        val urls = mutableSetOf<String>()
        Files.readAllLines(path)
            .filter(filter)
            .forEach { line -> extractTo(line, urls) }
        return urls
    }
}

/**
 * Collects the URLs contained in the regular files located directly inside a directory.
 * Sub-directories are not descended into.
 *
 * @property baseDir the directory whose files are read
 * @property filter predicate selecting which lines are scanned for URLs; accepts every line by default
 */
internal class DirectoryExtractor(
    val baseDir: Path,
    val filter: (String) -> Boolean = { true }
): UrlExtractor() {
    /**
     * Reads every regular file directly under [baseDir] line by line, keeps the lines accepted
     * by [filter] and extracts every URL from them.
     *
     * @return the distinct URLs found across all files, or an empty set when [baseDir] does not exist
     */
    fun extract(): Set<String> {
        if (!Files.exists(baseDir)) {
            return emptySet()
        }

        val urls = mutableSetOf<String>()
        // The directory handle must be released once the listing is consumed, hence `use`.
        Files.newDirectoryStream(baseDir).use { entries ->
            entries.filter { entry -> Files.isRegularFile(entry) }.forEach { file ->
                // `useLines` closes the reader after the last line has been processed.
                Files.newBufferedReader(file).useLines { lines ->
                    lines.filter(filter).forEach { line -> extractTo(line, urls) }
                }
            }
        }
        return urls
    }
}

/**
 * Entry points for extracting URLs from a resource, a file or a directory.
 * Each function builds the matching extractor with its default line filter and runs it.
 */
object LinkExtractors {
    /** Returns the URLs found in the resource named [resource]. */
    @JvmStatic
    fun fromResource(resource: String): Set<String> = ResourceExtractor(resource).extract()

    /** Returns the URLs found in the file at [path]. */
    @JvmStatic
    fun fromFile(path: Path): Set<String> = FileExtractor(path).extract()

    /** Returns the URLs found in the file whose location is given by the path string [path]. */
    @JvmStatic
    fun fromFile(path: String): Set<String> = FileExtractor(Paths.get(path)).extract()

    /** Returns the URLs found in the files directly under [baseDir]. */
    @JvmStatic
    fun fromDirectory(baseDir: Path): Set<String> = DirectoryExtractor(baseDir).extract()

    /** Returns the URLs found in the files directly under the directory given by the path string [baseDir]. */
    @JvmStatic
    fun fromDirectory(baseDir: String): Set<String> = DirectoryExtractor(Paths.get(baseDir)).extract()
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy