All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.protocol.ProtocolFactory.kt Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
package ai.platon.pulsar.skeleton.crawl.protocol

import ai.platon.pulsar.common.ResourceLoader
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.stringify
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.FetchMode
import org.apache.commons.lang3.StringUtils
import org.slf4j.LoggerFactory
import java.util.*
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicBoolean

/**
 * Creates and caches [Protocol] plugins. Protocol plugins should define
 * the attribute "protocolName" with the name of the protocol that they
 * implement. Configuration object is used for caching. Cache key is constructed
 * from appending protocol name (eg. http) to constant
 */
class ProtocolFactory(private val immutableConfig: ImmutableConfig) : AutoCloseable {
    private val logger = LoggerFactory.getLogger(ProtocolFactory::class.java)
    
    private val protocols: MutableMap = ConcurrentHashMap()
    private val closed = AtomicBoolean()
    
    init {
        ResourceLoader.readAllLines("protocol-plugins.txt")
            .asSequence()
            .map { it.trim() }
            .filterNot { it.startsWith("#") }
            .map { it.split("\\s+".toRegex()) }
            .filter { it.size >= 2 }
            .map { it[0] to getInstance(it) }
            .filter { it.second != null }
            .associate { it.first to it.second!! }
            .onEach { it.value.configure(immutableConfig) }
            .toMap(protocols)
        protocols.keys.joinToString(", ", "Supported protocols: ", "")
            .also { logger.info(it) }
    }
    
    /**
     * TODO: configurable, using major protocol/sub protocol is a good idea
     * Using major protocol/sub protocol is a good idea, for example:
     * selenium:http://www.baidu.com/
     * jdbc:h2:tcp://localhost/~/test
     */
    fun getProtocol(page: WebPage): Protocol {
        val fetchMode = page.fetchMode.takeIf { it != FetchMode.UNKNOWN } ?: FetchMode.BROWSER
        page.fetchMode = fetchMode
        
        return when (fetchMode) {
            FetchMode.BROWSER -> getProtocol("browser:" + page.url)
            else -> getProtocol(page.url)
        } ?: throw ProtocolNotFound(page.url)
    }
    
    /**
     * Returns the appropriate [Protocol] implementation for a url.
     *
     * @param url The url
     * @return The appropriate [Protocol] implementation for a given
     * [url].
     */
    fun getProtocol(url: String): Protocol? {
        val protocolName = StringUtils.substringBefore(url, ":")
        // sub protocol can be supported by main:sub://example.com later
        return protocols[protocolName]
    }
    
    fun getProtocol(mode: FetchMode): Protocol? {
        return getProtocol(mode.name.lowercase(Locale.getDefault()) + "://")
    }
    
    private fun getInstance(config: List): Protocol? {
        try {
            // config[0] is the protocol name, config[1] is the class name, and the rest are properties
            val className = config[1]
            return Class.forName(className).constructors.first().newInstance() as Protocol
        } catch (e: ClassNotFoundException) {
            logger.error(e.stringify())
        } catch (e: InstantiationException) {
            logger.error(e.stringify())
        } catch (e: IllegalAccessException) {
            logger.error(e.stringify())
        }
        return null
    }
    
    override fun close() {
        if (closed.compareAndSet(false, true)) {
            protocols.values.forEach { protocol: Protocol ->
                try {
                    protocol.close()
                } catch (e: Throwable) {
                    logger.error(e.toString())
                }
            }
            protocols.clear()
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy