All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.signature.TextProfileSignature.kt Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ai.platon.pulsar.skeleton.signature

import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.persist.WebPage
import org.apache.hadoop.io.MD5Hash
import java.util.*

/**
 * An implementation of a page signature. It calculates an MD5 hash of a plain
 * text "profile" of a page. In case there is no text, it calculates a hash
 * using the [MD5Signature].
 *
 * The text is taken from the page's content text; when that is shorter than
 * [GOOD_CONTENT_TEXT_LENGTH] characters, the full page text is used instead.
 *
 * The algorithm to calculate a page "profile" performs the following steps:
 *
 *  * split the text into tokens, where a token is a run of consecutive letters
 *  and digits (every other character is a separator), and bring all characters
 *  to lower case,
 *  * discard tokens equal to or shorter than the minimum token length
 *  (`db.signature.text_profile.min_token_len`, default 2 characters),
 *  * count the occurrences of every remaining token,
 *  * round down the counts of tokens to the nearest multiple of QUANT
 *  (`QUANT = round(quantRate * maxFreq)`, where `quantRate` comes from
 *  `db.signature.text_profile.quant_rate`, 0.01f by default, and `maxFreq`
 *  is the maximum token frequency). QUANT is never lower than 1, and if
 *  `maxFreq` is higher than 1 it is never lower than 2 (which means that
 *  tokens with frequency 1 are then always discarded),
 *  * discard tokens whose frequency after quantization falls below QUANT,
 *  * sort the remaining tokens by decreasing quantized frequency; tokens with
 *  the same frequency are ordered lexicographically so that the result does
 *  not depend on hash map iteration order,
 *  * create a list of `token count` entries, one per line, in that order.
 *
 * This list is then submitted to an MD5 hash calculation.
 *
 * @param conf configuration the minimum token length and quantization rate are read from
 */
class TextProfileSignature(conf: ImmutableConfig) : Signature() {

    /** Signature used when the page has no text at all. */
    private val fallback = MD5Signature()

    /** Tokens must be strictly longer than this to be counted. */
    private val minTokenLength: Int = conf.getInt("db.signature.text_profile.min_token_len", 2)

    /** Fraction of the maximum token frequency used as the quantization step. */
    private val quantRate: Float = conf.getFloat("db.signature.text_profile.quant_rate", 0.01f)

    /**
     * Calculates the signature of [page].
     *
     * @param page the page to calculate the signature for
     * @return the MD5 digest of the text profile, or the result of the fallback
     * [MD5Signature] when neither the content text nor the page text is available
     */
    override fun calculate(page: WebPage): ByteArray {
        var text = page.contentText
        if (text.isEmpty() || text.length < GOOD_CONTENT_TEXT_LENGTH) {
            // The extracted content is too short to be representative, use the whole page text.
            text = page.pageText
        }

        if (text.isEmpty()) {
            return fallback.calculate(page)
        }

        return MD5Hash.digest(buildProfile(text)).digest
    }

    /**
     * Builds the textual profile of [text]: one `token count` entry per line,
     * ordered by decreasing quantized count. The result is empty when no token
     * survives the length and frequency filters.
     */
    private fun buildProfile(text: String): String {
        val tokens = countTokens(text)
        val maxFreq = tokens.values.fold(0) { max, token -> maxOf(max, token.cnt) }

        // Calculate the quantization step. It is never 0, so the division below is safe.
        var quant = Math.round(maxFreq * quantRate)
        if (quant < 2) {
            quant = if (maxFreq > 1) 2 else 1
        }

        val profile = ArrayList<Token>()
        for (token in tokens.values) {
            // Round down to the nearest multiple of quant ...
            token.cnt = token.cnt / quant * quant
            // ... and keep only the tokens that did not drop below it.
            if (token.cnt >= quant) {
                profile.add(token)
            }
        }

        profile.sortWith(TokenComparator())
        return profile.joinToString("\n")
    }

    /**
     * Splits [text] into lower-cased tokens made of letters and digits and
     * counts the occurrences of every token longer than [minTokenLength].
     */
    private fun countTokens(text: String): HashMap<String, Token> {
        val tokens = HashMap<String, Token>()
        val current = StringBuilder()
        for (c in text) {
            if (Character.isLetterOrDigit(c)) {
                current.append(Character.toLowerCase(c))
            } else {
                flushToken(current, tokens)
            }
        }
        // The text may end in the middle of a token.
        flushToken(current, tokens)
        return tokens
    }

    /**
     * Counts the token accumulated in [current], if it is long enough, and
     * resets the buffer for the next token.
     */
    private fun flushToken(current: StringBuilder, tokens: MutableMap<String, Token>) {
        if (current.length > minTokenLength) {
            val value = current.toString()
            tokens.getOrPut(value) { Token(0, value) }.cnt++
        }
        current.setLength(0)
    }

    /** A token together with its (possibly quantized) number of occurrences. */
    private class Token(var cnt: Int, val value: String) {

        override fun toString(): String {
            return "$value $cnt"
        }
    }

    /**
     * Sorts tokens first by decreasing count and second in lexicographic order.
     *
     * The secondary criterion makes the order of equally frequent tokens, and
     * therefore the signature, independent of the iteration order of the hash
     * map the tokens were collected in.
     */
    private class TokenComparator : Comparator<Token> {
        override fun compare(t1: Token, t2: Token): Int {
            val byCount = t2.cnt.compareTo(t1.cnt)
            return if (byCount != 0) byCount else t1.value.compareTo(t2.value)
        }
    }

    companion object {
        /**
         * Minimum length, in characters, the content text must have to be used
         * for the profile; shorter content makes [calculate] use the page text.
         */
        var GOOD_CONTENT_TEXT_LENGTH = 2000
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy