All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.persist.model.WebPageFormatter.kt Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.persist.model

import ai.platon.pulsar.persist.HyperlinkPersistable
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.WebPageExt
import ai.platon.pulsar.persist.gora.generated.GFieldGroup
import ai.platon.pulsar.persist.gora.generated.GHypeLink
import com.google.gson.GsonBuilder
import org.apache.commons.lang3.StringUtils
import org.apache.gora.util.ByteUtils
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.time.Instant
import java.time.LocalDateTime
import java.util.*
import java.util.function.Consumer
import java.util.stream.Collectors

class WebPageFormatter(val page: WebPage) {
    private var withText = false
    private var withContent = false
    private var withLinks = false
    private var withFields = false
    private var withEntities = false
    private val pageExt = WebPageExt(page)
    private val zoneId = page.zoneId

    fun withText(withText: Boolean): WebPageFormatter {
        this.withText = withText
        return this
    }

    fun withText(): WebPageFormatter {
        withText = true
        return this
    }

    fun withContent(withContent: Boolean): WebPageFormatter {
        this.withContent = withContent
        return this
    }

    fun withContent(): WebPageFormatter {
        withContent = true
        return this
    }

    fun withLinks(withLinks: Boolean): WebPageFormatter {
        this.withLinks = withLinks
        return this
    }

    fun withLinks(): WebPageFormatter {
        withLinks = true
        return this
    }

    fun withFields(withFields: Boolean): WebPageFormatter {
        this.withFields = withFields
        return this
    }

    fun withFields(): WebPageFormatter {
        withFields = true
        return this
    }

    fun withEntities(withEntities: Boolean): WebPageFormatter {
        this.withEntities = withEntities
        return this
    }

    fun withEntities(): WebPageFormatter {
        withEntities = true
        return this
    }

    fun toMap(): Map {
        val fields: MutableMap = LinkedHashMap()
        /* General */
        fields["key"] = page.key
        fields["url"] = page.url
        fields["options"] = page.args
        fields["isSeed"] = page.isSeed
        fields["createTime"] = format(page.createTime)
        fields["distance"] = page.distance
        /* Fetch */fields["crawlStatus"] = page.crawlStatus.toString()
        fields["protocolStatus"] = page.protocolStatus.name
        fields["protocolStatusMessage"] = page.protocolStatus.toString()
        if (page.content != null) {
            fields["contentLength"] = page.content!!.array().size
        }
        fields["fetchCount"] = page.fetchCount
        fields["fetchPriority"] = page.fetchPriority
        fields["fetchInterval"] = page.fetchInterval.toString()
        fields["retriesSinceFetch"] = page.fetchRetries
        fields["prevFetchTime"] = format(page.prevFetchTime)
        fields["fetchTime"] = format(page.fetchTime)
        fields["prevModifiedTime"] = format(page.prevModifiedTime)
        fields["modifiedTime"] = format(page.modifiedTime)
        fields["baseUrl"] = page.location
        fields["batchId"] = page.batchId ?: "(null)"
        /* Parse */fields["parseStatus"] = page.parseStatus.name
        fields["parseStatusMessage"] = page.parseStatus.toString()
        fields["encoding"] = page.encoding?:""
        fields["prevSignature"] = page.prevSignatureAsString
        fields["signature"] = page.signatureAsString
        fields["pageCategory"] = page.pageCategory.name
        fields["prevContentPublishTime"] = format(page.prevContentPublishTime)
        fields["contentPublishTime"] = format(page.contentPublishTime)
        fields["prevContentModifiedTime"] = format(page.prevContentModifiedTime)
        fields["contentModifiedTime"] = format(page.contentModifiedTime)
        fields["prevRefContentPublishTime"] = format(page.prevRefContentPublishTime)
        fields["refContentPublishTime"] = format(page.refContentPublishTime)
        // May be too long
// fields.put("inlinkAnchors", page.getInlinkAnchors());
        fields["pageTitle"] = page.pageTitle
        fields["contentTitle"] = page.contentTitle
        fields["inlinkAnchor"] = page.anchor
        fields["title"] = pageExt.sniffTitle()
        /* Score */fields["contentScore"] = page.contentScore.toString()
        fields["score"] = page.score.toString()
        fields["cash"] = page.cash.toString()
        fields["marks"] = page.marks.asStringMap()
        fields["pageCounters"] = page.pageCounters.asStringMap()
        fields["metadata"] = page.metadata.asStringMap()
        fields["headers"] = page.headers.asStringMap()
        fields["linkCount"] = page.links.size
        fields["vividLinkCount"] = page.vividLinks.size
        fields["liveLinkCount"] = page.liveLinks.size
        fields["deadLinkCount"] = page.deadLinks.size
        fields["inlinkCount"] = page.inlinks.size
        fields["linksMessage"] = ("Total "
                + page.links.size + " links, "
                + page.vividLinks.size + " vivid links, "
                + page.liveLinks.size + " live links, "
                + page.deadLinks.size + " dead links, "
                + page.inlinks.size + " inlinks")
        if (withLinks) {
            fields["links"] = page.links.stream().map { obj: CharSequence -> obj.toString() }.collect(Collectors.toList())
            fields["vividLinks"] = page.vividLinks.values.stream().map { obj: CharSequence -> obj.toString() }.collect(Collectors.toList())
            fields["liveLinks"] = page.liveLinks.values.stream().map { l: GHypeLink? -> HyperlinkPersistable.box(l!!).toString() }.collect(Collectors.toList())
            fields["deadLinks"] = page.deadLinks.stream().map { obj: CharSequence -> obj.toString() }.collect(Collectors.toList())
            fields["inlinks"] = page.inlinks.entries.stream()
                    .map { il: Map.Entry -> il.key.toString() + "\t" + il.value }.collect(Collectors.joining("\n"))
        }
        if (withText) {
            fields["contentText"] = page.contentText
            fields["pageText"] = page.pageText
        }
        if (withContent && page.content != null) {
            fields["content"] = page.contentAsString
        }
        if (withEntities) {
            val pageModel = page.pageModel
            if (pageModel != null) {
                val pageEntities = pageModel.unboxedFieldGroups.map { FieldGroupFormatter(it).fields.entries }
                fields["pageEntities"] = pageEntities
            }
        }
        return fields
    }

    fun toMap(fields: Set): Map {
        return toMap().entries.filter { fields.contains(it.key) }.associate { it.key to it.value }
    }

    fun format(): String {
        val sb = StringBuilder()
        sb.append("url:\t" + page.url + "\n")
                .append("baseUrl:\t" + page.location + "\n")
                .append("batchId:\t" + page.batchId + "\n")
                .append("crawlStatus:\t" + page.crawlStatus + "\n")
                .append("protocolStatus:\t" + page.protocolStatus + "\n")
                .append("depth:\t" + page.distance + "\n")
                .append("pageCategory:\t" + page.pageCategory + "\n")
                .append("fetchCount:\t" + page.fetchCount + "\n")
                .append("fetchPriority:\t" + page.fetchPriority + "\n")
                .append("fetchInterval:\t" + page.fetchInterval + "\n")
                .append("retriesSinceFetch:\t" + page.fetchRetries + "\n")
        sb.append("\n")
                .append("options:\t" + page.args + "\n")
        sb.append("\n")
                .append("createTime:\t" + format(page.createTime) + "\n")
                .append("prevFetchTime:\t" + format(page.prevFetchTime) + "\n")
                .append("fetchTime:\t" + format(page.fetchTime) + "\n")
                .append("prevModifiedTime:\t" + format(page.prevModifiedTime) + "\n")
                .append("modifiedTime:\t" + format(page.modifiedTime) + "\n")
                .append("prevContentModifiedTime:\t" + format(page.prevContentModifiedTime) + "\n")
                .append("contentModifiedTime:\t" + format(page.contentModifiedTime) + "\n")
                .append("prevContentPublishTime:\t" + format(page.prevContentPublishTime) + "\n")
                .append("contentPublishTime:\t" + format(page.contentPublishTime) + "\n")
        sb.append("\n")
                .append("prevRefContentPublishTime:\t" + format(page.prevRefContentPublishTime) + "\n")
                .append("refContentPublishTime:\t" + format(page.refContentPublishTime) + "\n")
        sb.append("\n")
                .append("pageTitle:\t" + page.pageTitle + "\n")
                .append("contentTitle:\t" + page.contentTitle + "\n")
                .append("anchor:\t" + page.anchor + "\n")
                .append("title:\t" + pageExt.sniffTitle() + "\n")
        sb.append("\n")
                .append("parseStatus:\t" + page.parseStatus.toString() + "\n")
                .append("prevSignature:\t" + page.prevSignatureAsString + "\n")
                .append("signature:\t" + page.signatureAsString + "\n")
                .append("contentScore:\t" + page.contentScore + "\n")
                .append("score:\t" + page.score + "\n")
                .append("cash:\t" + page.cash + "\n")

        val crawlMarks = page.marks
        if (crawlMarks.unbox().isNotEmpty()) {
            sb.append("\n")
            crawlMarks.unbox().forEach { (key, value) -> sb.append("mark $key:\t$value\n") }
        }
        if (page.pageCounters.unbox().isNotEmpty()) {
            sb.append("\n")
            page.pageCounters.unbox().forEach { (key, value) -> sb.append("counter $key : $value\n") }
        }
        val metadata = page.metadata.asStringMap()
        if (metadata.isNotEmpty()) {
            sb.append("\n")
            metadata.entries.stream().filter { it.value.startsWith("meta_") }
                    .forEach { (key, value) -> sb.append("metadata " + key + ":\t" + value + "\n") }
            metadata.entries.stream().filter { e -> e.value.startsWith("meta_") }
                    .forEach { (key, value) -> sb.append("metadata " + key + ":\t" + value + "\n") }
        }
        val headers = page.headers.unbox()
        if (headers != null && headers.isNotEmpty()) {
            sb.append("\n")
            headers.forEach { (key, value) -> sb.append("header $key:\t$value\n") }
        }

        sb.append("\n")
        sb.append("Total " + page.links.size + " links, ")
                .append(page.vividLinks.size.toString() + " vivid links, ")
                .append(page.liveLinks.size.toString() + " live links, ")
                .append(page.deadLinks.size.toString() + " dead links, ")
                .append(page.inlinks.size.toString() + " inlinks\n")

        if (withLinks) {
            sb.append("\n")
            sb.append("links:\n")
            page.links.forEach { l -> sb.append("links:\t$l\n") }
            sb.append("vividLinks:\n")
            page.vividLinks.forEach { (k, v) -> sb.append("liveLinks:\t$k\t$v\n") }
            sb.append("liveLinks:\n")
            page.liveLinks.values.forEach(Consumer { e: GHypeLink -> sb.append("liveLinks:\t" + e.url + "\t" + e.anchor + "\n") })
            sb.append("deadLinks:\n")
            page.deadLinks.forEach(Consumer { l: CharSequence -> sb.append("deadLinks:\t$l\n") })
            sb.append("inlinks:\n")
            page.inlinks.forEach { (key, value) -> sb.append("inlink:\t$key\t$value\n") }
        }

        if (withContent) {
            val content = page.content
            if (content != null) {
                sb.append("\n")
                sb.append("contentType:\t" + page.contentType + "\n")
                        .append("content:START>>>\n")
                        .append(ByteUtils.toString(content.array()))
                        .append("\n<< 0) {
                sb.append("\n")
                sb.append("contentText:START>>>\n")
                        .append(page.contentText)
                        .append("\n<< 0) {
                sb.append("pageText:START>>>\n")
                        .append(page.pageText)
                        .append("\n<<>>\n")
                pageModel.unboxedFieldGroups
                    .flatMap { FieldGroupFormatter(it).fields.entries }
                    .joinTo(sb) { it.key + ": " + it.value }
                sb.append("\n<<




© 2015 - 2024 Weber Informatics LLC | Privacy Policy